The following notebook is comprised of 7 primary steps:
import pygeostat as gs
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
% matplotlib inline
Load the previously set Matplotlib and Pygeostat settings.
# Restore the Matplotlib style and pygeostat project settings saved by the
# introduction notebook.
# NOTE(review): both calls load the same gsParams.txt — gsPlotStyle.load
# presumably expects a plot-style file; confirm the intended file name.
gs.gsPlotStyle.load('../00-Introduction/gsParams.txt')
gs.gsParams.load('../00-Introduction/gsParams.txt')
Recall the grid definition and catdict defaults.
# Echo the project defaults that were recalled from the loaded settings.
print(gs.gsParams['data.griddef'])
print(gs.gsParams['data.catdict']) # category code -> category name dictionary
Grab the grid definition for convenience
# Local alias for the project grid definition, used throughout this notebook.
griddef = gs.gsParams['data.griddef']
Use a blue-white-red colormap for the continuous variables in this workflow.
# Diverging blue-white-red colormap suits the signed distance function values.
gs.gsParams['plotting.cmap'] = 'bwr'
Adjust the trimming limits and null values for the project.
# Project trimming limit and null code: values below tmin are treated as
# missing on read; null is the sentinel written for missing values on output.
gs.gsParams['data.tmin'] = -998 # values less than -998 are treated as NaN
gs.gsParams['data.null'] = -999
Store the default number of realizations used by subsequent Pygeostat function calls.
nreal = 20 # number of realizations; could be raised (e.g. to 100) for a production run
gs.gsParams['data.nreal'] = nreal
You can increase or decrease this based on the number of CPUs on your computer (recommend leaving at least a couple free).
# Parallel-process count for program execution; leave a couple of CPUs free.
gs.gsParams['config.nprocess'] = 4 # lower (e.g. 1) if the machine struggles; raise on a strong workstation
# Path to the CCG executables
exedir = '../exes/'
# Create the output directory
outdir = 'Output/'
gs.mkdir(outdir)
# Load the boundary data on the project grid; 'Domain Indicator' is the
# categorical inside/outside flag used for the boundary model.
dat = gs.DataFile('../data/PRM_Boundary.dat', griddef=griddef,
                  cat='Domain Indicator')
# Inspect the inferred special attributes and a data preview.
print('x and y attributes:', dat.x, dat.y)
print('categorical attribute:', dat.cat)
print('data file type:', dat.dftype)
print(dat.head())
dat.describe()
gs.locmap(dat); # location map of the indicator data
The indicator variogram is calculated and modeled, since this is required input to calculation of the Gaussian variogram model in the next section (used for distance function $df$ modeling).
Variogram calculation, modeling, plotting and checking are readily accomplished with the variogram object, although unprovided parameters are inferred.
# Initialize the variogram object; varg_ind stores all parameters and results
varg_ind = gs.Variogram(dat, dat.cat, variostd=True, mute=True,
                        ndim=2, ndir=1, omnihz=True) # equivalently datafl=dat, var=dat.cat; 2 dimensions, one omnidirectional horizontal direction
# Set required parameters before calculating the experimental
# variogram
varg_ind.update_calcpars(nlags=22, lagdist=30, lagtol=20,
                         azm=0, azmtol=90, bandhorz=1.0e21)
varg_ind.varcalc() # run the experimental variogram calculation
# Set required parameters before modeling the variogram
varg_ind.update_modelpars(c0=0.0, it=1, nst=2) # it = structure type, nst = number of nested structures
varg_ind.fitmodel(maxiter=2500, sill=1.0)
# Plot the experimental points and fitted model
gs.gsParams['plotting.varplt.ms'] = 7 # marker size for variogram plots
fig = varg_ind.plot(model=True, titles=dat.cat+' Variogram',
                    figsize=(7, 5))
This cell is not required, but included for explanation of the Variogram object and inferred parameters.
# Introspect the Variogram object to demonstrate its attributes/methods.
dir(varg_ind) # list all attributes and methods of the object
# Print the applied and inferred parameters that were used in the calculation
print('\nExperimental calculation parameters:')
for key, val in varg_ind.calcpars.items():
    print(key+' = ', val)
# Print the first few rows of the experimental variograms table
print('\n Experimental variogram:\n', varg_ind.expvargs[0].head(20)) # argument of head() is the number of rows shown
# Print Variogram Model
print('\n Indicator Variogram Model: {}\n'.format(varg_ind.model))
The Gaussian variogram that yields the indicator variogram after truncation of a Gaussian random field is calculated. This Gaussian variogram is modeled and input to $df$ modeling.
The proportion is required input to bigaus2 below along with the indicator variogram.
# Global proportion of 'inside' samples (mean of the 0/1 indicator);
# required input to bigaus2 below.
proportion = sum(dat['Domain Indicator'])/len(dat)
print('Proportion of inside data: %.3f'%(proportion))
The bigaus2 program applies the Gaussian integration method, given the indicator variogram and the proportion of the indicator.
# bigaus2 converts between indicator and Gaussian variograms by Gaussian
# integration, given the indicator variogram model and the proportion.
bigaus2 = gs.Program(program=exedir+'bigaus2')
# Parameter-file template; {placeholders} are filled via str.format below.
parstr = """ Parameters for BIGAUS2
**********************
START OF PARAMETERS:
1 -input mode (1) model or (2) variogram file
nofile.out -file for input variogram
{proportion} -threshold/proportion
2 -calculation mode (1) NS->Ind or (2) Ind->NS
{outfl} -file for output of variograms
1 -number of thresholds
{proportion} -threshold cdf values
1 {nlag} -number of directions and lags
{major} 0.0 {lag} -azm(1), dip(1), lag(1)
{varstr}
"""
# Template values: direction/lag settings are drawn from the calculated
# indicator variogram so the outputs line up.
pars = dict(proportion=proportion, major=varg_ind.major[0],
            lag=varg_ind.calcpars['lagdist'][0],
            nlag=varg_ind.calcpars['nlags'][0],
            outfl=outdir+'bigaus2.out',
            varstr=varg_ind.model)
bigaus2.run(parstr=parstr.format(**pars), nogetarg=True)
The bigaus2 program outputs an odd (legacyish) variogram format, which must be translated to the standard Variogram format.
# Read in the data before demonstrating its present form
expvargs = gs.readvarg(outdir+'bigaus2.out', 'all')
expvargs.head()
# Add requisite fields for the standard variogram layout
expvargs['azimuth']=0 # add the direction fields
expvargs['dip'] = 0
# Rename existing fields to the standard fields
# NOTE(review): rename/drop appear to rely on pygeostat's in-place semantics
# (cf. est_df.rename/drop later); if readvarg returned a plain pandas
# DataFrame these unassigned calls would be no-ops — verify.
expvargs.rename({'Value':'vario', 'Distance':'h', 'Points':'numpairs'})
# Drop the extraneous fields
expvargs.drop(['Head', 'Tail'])
# Demonstrate the present and required form
expvargs.head()
This model is input to distance function estimation.
# Initialize the variogram object for the Gaussian variogram
varg_gaus = gs.Variogram(variostd=True, mute=True, ndim=2, ndir=1, omnihz=True)
# Assign calculation parameters; they only need to correspond with the
# previously calculated expvargs in this case (rather than being used
# in a calculation)
varg_gaus.update_calcpars(nlags=22, lagdist=30, azm=20)
# Attach the bigaus2-derived experimental points as the major direction
varg_gaus.majorexp = expvargs.data
# Set required parameters before modeling the variogram
varg_gaus.update_modelpars(c0=0.01, it=[3,3], nst=2, ang2=[[0,0],[0,0]],
                           ang3=[[0,0],[0,0]])
varg_gaus.fitmodel(maxiter=2500, sill=1.0) # fit the model to the experimental points
# Print the variogram model
print('\n Gaussian Variogram Model: {}\n'.format(varg_gaus.model))
# Plot the indicator and Gaussian variograms side by side
fig, axes = plt.subplots(1, 2, figsize=(10, 4))
for varg, ax, title in zip([varg_ind, varg_gaus], axes,
                           ['Indicator Variogram', 'Gaussian Variogram']):
    fig = varg.plot(model=True, titles=title, axes=ax)
fig.tight_layout()
The $df$ is calculated at the data locations, before being estimated at the grid locations. The $c$ parameter is applied to the $df$ calculation, defining the bandwidth of uncertainty that will be simulated in the next section.
Normally the optimal $c$ would be calculated using a jackknife study, but it is simply provided here.
# Bandwidth parameter C of the distance function; normally chosen via a
# jackknife study, simply provided here.
selected_c = 50
dfcalc = gs.Program(program=exedir+'dfcalc') # calculates the distance function at the data locations
# Print the columns for populating the parameter file without variables
print(dat.columns)
parstr = """ Parameters for DFCalc
*********************
START OF PARAMETERS:
{datafl} -file with input data
1 2 3 0 4 -column for DH,X,Y,Z,Ind
1 -in code: indicator for inside domain
0.0 0.0 0.0 -angles for anisotropy ellipsoid
1.0 1.0 -first and second anisotropy ratios (typically <=1)
0 -proportion of drillholes to remove
696969 -random number seed
{c} -C
{outfl} -file for distance function output
'nofile.out' -file for excluded drillholes output
"""
pars = dict(datafl=dat.flname, c=selected_c,
            outfl=outdir+'df_calc.out')
dfcalc.run(parstr=parstr.format(**pars))
A standard naming convention of the distance function variable is used for convenience in the workflow, motivating the manipulation.
# Load the data and note the abbreviated name of the distance function
dat_df = gs.DataFile(outdir+'df_calc.out', notvariables='Ind', griddef=griddef)
print('Initial distance Function variable name = ', dat_df.variables)
# Set a standard distance function name used throughout the workflow
dfvar = 'Distance Function'
dat_df.rename({dat_df.variables:dfvar}) # in-place rename via {old: new} mapping
print('Distance Function variable name = ', dat_df.variables)
dat_df.shape
# Set symmetric color limits for the signed distance function
df_vlim = (-350, 350)
gs.locmap(dat_df, vlim=df_vlim, cbar_label='m')
Kriging is performed with a large number of data to provide a smooth and conditionally unbiased estimate. Global kriging would also be appropriate.
# Krige the distance function to the grid; many data are used for a smooth,
# conditionally unbiased estimate.
kd3dn = gs.Program(program=exedir+'kt3dn')
parstr = """ Parameters for KT3DN
********************
START OF PARAMETERS:
{input_file} -file with data
1 2 3 0 6 0 - columns for DH,X,Y,Z,var,sec var
-998.0 1.0e21 - trimming limits
0 -option: 0=grid, 1=cross, 2=jackknife
xvk.dat -file with jackknife data
1 2 0 3 0 - columns for X,Y,Z,vr and sec var
nofile.out -data spacing analysis output file (see note)
0 15.0 - number to search (0 for no dataspacing analysis, rec. 10 or 20) and composite length
0 100 0 -debugging level: 0,3,5,10; max data for GSKV;output total weight of each data?(0=no,1=yes)
{out_sum} -file for debugging output (see note)
{out_grid} -file for kriged output (see GSB note)
{gridstr}
1 1 1 -x,y and z block discretization
1 100 100 1 -min, max data for kriging,upper max for ASO,ASO incr
0 0 -max per octant, max per drillhole (0-> not used)
700.0 700.0 500.0 -maximum search radii
0.0 0.0 0.0 -angles for search ellipsoid
1 -0=SK,1=OK,2=LVM(resid),3=LVM((1-w)*m(u))),4=colo,5=exdrift,6=ICCK
0.0 0.6 0.8 1.6 - mean (if 0,4,5,6), corr. (if 4 or 6), var. reduction factor (if 4)
0 0 0 0 0 0 0 0 0 -drift: x,y,z,xx,yy,zz,xy,xz,zy
0 -0, variable; 1, estimate trend
extdrift.out -gridded file with drift/mean
4 - column number in gridded file
keyout.out -gridded file with keyout (see note)
0 1 - column (0 if no keyout) and value to keep
{varmodelstr}
"""
# The Gaussian variogram model is used for kriging the distance function.
pars = dict(input_file=outdir+'df_calc.out',
            out_grid=outdir+'kt3dn_df.gsb',
            out_sum=outdir+'kt3dn_sum.out',
            gridstr=griddef, varmodelstr=varg_gaus.model)
kd3dn.run(parstr=parstr.format(**pars))
pixelplt selects pointvar as the color of the overlain dat_df point data since its name matches the column name of est_df.
# Load the kriged distance function estimate.
est_df = gs.DataFile(flname=outdir+'kt3dn_df.gsb')
# Drop the variance since we won't be using it,
# allowing for specification of the column to be avoided
est_df.drop('EstimateVariance')
# Rename to the standard distance function name for convenience
est_df.rename({est_df.variables:dfvar})
est_df.describe()
# Generate a figure object with one colorbar per panel
fig, axes = gs.subplots(1, 2, figsize=(10, 8),cbar_mode='each',
                        axes_pad=0.8, cbar_pad=0.1)
# Location map of indicator data for comparison
gs.locmap(dat, ax=axes[0])
# Map of distance function data and estimate
gs.pixelplt(est_df, pointdata=dat_df,
            pointkws={'edgecolors':'k', 's':25},
            cbar_label='m', vlim=df_vlim, ax=axes[1])
This section is subdivided into 4 sub-sections:
Simulation will generate distance function deviates in the $[-C, C]$ range, which are then added to the $df$ estimate before thresholding at $df=0$. Therefore, any estimates with $df<-c$ are certainly within the boundary (and vice versa) and so, do not need to have their $df$ deviates simulated.
A keyout is therefore generated, which amounts to an indicator where $I=0$ are not simulated and $I=1$ are simulated. Based on the description above $I=1$ when the $df$ estimate is in the $[-C, C]$ range; $I=0$ otherwise.
import copy
# Copy the estimate DataFile to build the keyout from
keyout = copy.deepcopy(est_df)
# Initialize a keyout column with 0s (not simulated)
keyvar = 'Keyout'
keyout[keyvar] = 0
# Generate an index that is true where |df| <= C (uncertain zone to simulate)
idx = abs(keyout[dfvar]) <= selected_c
print('{} values in, {} values out'.format(sum(idx), griddef.count()-sum(idx)))
# The keyout is 1 when the idx is true
# Boolean-index assignment must be applied to the underlying DataFrame
keyout.data.loc[idx, keyvar] = 1
# Drop the distance function
keyout.drop(dfvar)
# Write out
keyout.writefile(outdir+'keyout.gsb')
# Plot the simulate/do-not-simulate mask
gs.pixelplt(keyout, cmap='bone_r', catdict={0:'Do not simulate', 1:'Simulate'})
A notably subjective parameter here is the input variogram model, which does not need to match the variogram model used for calculation of the $df$. Its derivation is unclear, but followed visual evaluation of the results ($5 \cdot c$)
# Unconditional Gaussian simulation of distance-function deviates inside the
# keyout, one realization per parameter file, run in parallel.
simdir = outdir+'USGSIM/'
gs.mkdir(simdir)
usgsim = gs.Program(program=exedir+'usgsim')
# Variogram range for the deviates; subjective choice of 5*C (see text above)
varg_h = selected_c*5
parstr = """ Parameters for USGSIM
*********************
START OF MAIN:
1 -number of realizations to generate, 0=kriging
1 -number of variables being simulated
1 -number of rock types to consider
{seed} -random number seed
{gridstr}
{outfl} -file for simulation output
2 - output format: (0=reg, 1=coord, 2=binary)
impute.out -file for imputed values in case of heterotopic samples
1 -debugging level: 0,1,2,3
{dbgfile} -file for debugging output
START OF SRCH:
40 -number of data to use per variable
{varg_h} {varg_h} 10.0 -maximum search radii (hmax,hmin,vert)
0.0 0.0 0.0 -angles for search ellipsoid
1 -sort by distance (0) or covariance (1)
1 1 1 -if sorting by covariance, indicate variogram rock type, head, tail to use
START OF VARG:
1 -number of variograms
1 1 1 -rock type, variable 1, variable 2
1 0.001 - number of structures, nugget effect
3 0.999 0.0 0.0 0.0 - type,variance,ang1,ang2,ang3
{varg_h} {varg_h} 50.0 - a_hmax, a_hmin, a_vert
START OF ROCK:
1 -rock type codes # simulated categories could be entered here
{keyfl} -file with rock types
1 -column for rock type
"""
# Setup parameters in advance of parallel simulation of
# each realization, beginning with a dictionary of
# parameters that do not vary
pars = dict(gridstr=griddef, varg_h=varg_h, keyfl=outdir+'keyout.gsb')
# callpars will be a list of parameter sets that are applied in parallel
callpars = []
# Per-realization random number seeds derived from a master seed
seeds = gs.rseed_list(nreal, seed=23243)
# Iterate over the realizations, generating a list of
# associated parameter files
for i, seed in enumerate(seeds):
    # parameters that vary by realization
    pars['seed'] = seed
    pars['outfl'] = simdir+'real{}.gsb'.format(i+1)
    pars['dbgfile'] =simdir+'real{}.dbg'.format(i+1)
    # The parstr is formatted by this iteration of pars,
    # which is then appended to the callpars list
    callpars.append(dict(parstr=parstr.format(**pars)))
# Simulate in parallel: runs all nreal parameter files across
# config.nprocess worker processes
gs.runparallel(usgsim, callpars, reportprogress=True)
# Generate a figure showing the first four Gaussian-deviate realizations
fig, axes = gs.subplots(2, 2, figsize=(10, 8), cbar_mode='single')
# Iterate over the realizations
for real, ax in enumerate(axes):
    sim = gs.DataFile(simdir+'real{}.gsb'.format(real+1))
    gs.pixelplt(sim, title='Realization {}'.format(real+1),
                cbar_label='Gaussian Deviations', ax=ax, vlim=(-3, 3))
See the section description for additional details.
# Required package for the Gaussian CDF transform
from scipy.stats import norm
# Create a directory for the output
domaindir = 'Domains/'
gs.mkdir(domaindir)
# Generate logical numpy index array that identifies the locations
# where simulation has occurred
idx = keyout[keyvar].values == 1
for real in range(nreal):
    # Load the simulated Gaussian deviates and convert to a
    # flat numpy array
    sim = gs.DataFile(simdir+'real{}.gsb'.format(real+1))
    sim = sim.data.values.flatten()
    # Transform the Gaussian deviates to probabilities
    sim = norm.cdf(sim[idx], loc=0.0, scale=1.0)
    # Transform the probabilities to distance function deviates in [-C, C]
    sim = 2 *selected_c * sim - selected_c # Gaussian deviates converted to distance function deviates
    # Initialize the final realization as the distance function estimate
    df = est_df[dfvar].values
    # Add the distance function deviates to the distance function estimate,
    # yielding a distance function realization
    df[idx] = df[idx] + sim
    # Where the distance function is <= 0 the location is inside the
    # boundary, so the simulated indicator is 1
    sim = (df <= 0).astype(int)
    # Convert the Numpy array to a Pandas DataFrame, which is required
    # for initializing a DataFile (aside from the demonstrated flname approach).
    # The DataFile is then written out
    sim = pd.DataFrame(data=sim, columns=[dat.cat])
    sim = gs.DataFile(data=sim)
    sim.writefile(domaindir+'real{}.gsb'.format(real+1))
# Plot four domain realizations with the indicator data overlain,
# then save the project settings and clean temporary output.
fig, axes = gs.subplots(2, 2, figsize=(10, 8), cbar_mode='single')
for real, ax in enumerate(axes):
    sim = gs.DataFile(domaindir+'real{}.gsb'.format(real+1))
    gs.pixelplt(sim, title='Realization {}'.format(real+1),
                pointdata=dat,
                pointkws={'edgecolors':'k', 's':25},
                vlim=(0, 1), ax=ax)
# Persist settings for the next notebook; remove intermediate files
gs.gsParams.save('gsParams.txt')
gs.rmdir(outdir)
gs.rmfile('temp')
The following notebook is comprised of 9 primary steps:
import pygeostat as gs
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
% matplotlib inline
Load the Matplotlib and Pygeostat project defaults.
# Restore plot styling and the project settings saved by the boundary notebook.
# NOTE(review): gsPlotStyle.load is given gsParams.txt — confirm the intended
# style file name.
gs.gsPlotStyle.load('../00-Introduction/gsParams.txt')
gs.gsParams.load('../01-Boundary/gsParams.txt')
Assign the number of realization and griddef to variables for convenience.
# Recall the realization count and grid definition from the loaded settings.
nreal = gs.gsParams['data.nreal']
print('nreal = {}'.format(nreal))
griddef = gs.gsParams['data.griddef']
griddef
The cmap was set to bwr in the previous section - reset to the Matplotlib default
# Restore the Matplotlib default colormap (previous notebook used 'bwr').
gs.gsParams['plotting.cmap'] = 'viridis'
# Path to the CCG executables
exedir = '../exes/'
# Domain realizations for keyout
domaindir = '../01-Boundary/Domains/'
# create the output directory
outdir = 'Output/'
gs.mkdir(outdir)
# Load the surface data; coordinate columns are inferred on initialization.
dat = gs.DataFile('../data/PRM_Surface.dat')
print('columns = {}'.format(dat.columns.values))
print('x column = {}, y column = {}'.format(dat.x, dat.y))
The DataFile describe excludes special attributes.
# Summary statistics (special attributes excluded) and per-variable maps.
dat.describe()
fig, axes = plt.subplots(2, 2, figsize=(10, 10))
axes = axes.flatten()
for var, ax in zip(dat.variables, axes):
    gs.locmap(dat, var=var, cbar_label='m', title=var, ax=ax)
# Remove the unneeded fourth axis
axes[-1].remove()
fig.tight_layout()
Declustering defines the spatially representative distribution of the Top Elevation and Thickness. These distributions are then used for the normal score transformation, generating conditioning data that is used for calculating normal score variograms and conditioning sequential Gaussian simulation.
Note that variograms are often calculated on data that are normal score transformed without declustering weights. Given the controversy of the subject, the more convenient method is selected for this course (since declustering with weights is absolutely required for conditioning).
All variables are homotopically sampled, so only one variable need be considered for declustering.
# Cell declustering to obtain spatially representative weights.
declus = gs.Program(program=exedir+'declus')
# Cell size based on the data spacing study in the intro notebook
cellsize=90
parstr = """ Parameters for DECLUS
*********************
START OF PARAMETERS:
{datafl} -file with data
{xycol} 0 {varcol} - columns for X, Y, Z, and variable
-1.0e21 1.0e21 - trimming limits
junk.out -file for summary output
{outfl} -file for output with data & weights
1.0 1.0 -Y and Z cell anisotropy (Ysize=size*Yanis)
0 -0=look for minimum declustered mean (1=max)
1 {cellsize} {cellsize} -number of cell sizes, min size, max size
5 -number of origin offsets
"""
# gscol translates column names into 1-indexed GSLIB column numbers.
pars = dict(datafl=dat.flname, xycol=dat.gscol(dat.xyz),
            varcol=dat.gscol('Top Elevation'),
            outfl=outdir+'declus.out', cellsize=cellsize)
declus.run(parstr=parstr.format(**pars))
gs.rmfile('junk.out')
In the context of the upcoming modeling steps, Base Elevation is not considered a variable. Use of the notvariables kwarg on initialization excludes it from the variables attribute.
# Reload the declustered data; Base Elevation is excluded from the modeled
# variables via notvariables.
dat = gs.DataFile('Output/declus.out', notvariables='Base Elevation')
print('declustering weight = ', dat.wts)
print('variables = ', dat.variables)
gs.locmap(dat, var=dat.wts)
# Normal score transformation with declustering weights.
unscore = gs.Program(program=exedir+'unscore')
parstr = """ Parameters for NSCORE
*********************
START OF PARAMETERS:
{datafl} -file with data
{nvar} {varcols} - number of variables and columns #nscoring both variables simultaniously
{wtcol} - column for weight, 0 if none #declustering weights
0 - column for category, 0 if none
0 - number of records if known, 0 if unknown
-1.0e21 1.0e21 - trimming limits
0 -transform using a reference distribution, 1=yes
../histsmth/histsmth.out -file with reference distribution.
1 2 0 -columns for variable, weight, and category
1000 -maximum number of quantiles, 0 for all
{outfl} -file for output
{trnfl} -file for output transformation table
"""
pars = dict(datafl=dat.flname, nvar=dat.nvar,
            varcols=dat.gscol(dat.variables),
            wtcol=dat.gscol(dat.wts),
            outfl = outdir+'nscore.out', trnfl = outdir+'nscore.trn')
unscore.run(parstr=parstr.format(**pars))
Use of the notvariables kwarg leads to isolation of the normal score variables as the dat_ns.variables attribute. Since DataFiles with initialized wts are passed to histplt, wt may be True.
# Load the normal score output; excluding the original variables isolates the
# NS variables as dat_ns.variables.
dat_ns = gs.DataFile(outdir+'nscore.out',
                     notvariables=['Base Elevation']+dat.variables)
print('variables = ', dat_ns.variables)
# Original (top row) vs normal score (bottom row) weighted histograms
fig, axes = plt.subplots(2 , 2, figsize=(10, 10))
for var, ax in zip(dat.variables, axes[0]):
    gs.histplt(dat, var=var, wt=True, ax=ax, stat_blk='minimal')
for var, ax in zip(dat_ns.variables, axes[1]):
    gs.histplt(dat_ns, var, wt=True, ax=ax,
               xlim=(-3, 3), stat_blk='minimal')
fig.tight_layout()
Normal score variograms are calculated and modeled, before being input to sequential Gaussian simulation in the next section.
Please refer to the Boundary Modeling notebook for additional details about the Variogram object.
# Calculate, model, and plot a normal score variogram per variable.
fig, axes = plt.subplots(1, 2, figsize=(10, 4))
# Initialize a list that will store the variogram objects
vargs = []
for var, ax in zip(dat_ns.variables, axes):
    varg = gs.Variogram(dat_ns, var, ndim=2, ndir=1, omnihz=True,
                        mute=True)
    # Variogram calculation
    varg.update_calcpars(nlags=10, lagdist=30.0, lagtol=20.0, azm=0,
                         azmtol=90, bandhorz=10000, variostd=True)
    varg.varcalc()
    # Variogram modeling
    varg.update_modelpars(c0=0.01, it=[3, 3], nst=2)
    varg.fitmodel(maxiter=2500, sill=1.0)
    # Variogram plotting
    fig = varg.plot(model=True, titles=var, axes=ax, separate_plts=False)
    # Append to the list for later use in simulation
    vargs.append(varg)
fig.tight_layout()
# Sequential Gaussian co-simulation of the two normal score variables,
# one realization per parameter file, run in parallel.
simdir = outdir+'USGSIM/'
gs.mkdir(simdir)
usgsim = gs.Program(program=exedir+'usgsim')
parstr = """ Parameters for USGSIM
*********************
START OF MAIN:
1 -number of realizations to generate, 0=kriging
2 -number of variables being simulated
0 -number of rock types to consider
{seed} -random number seed #important as per Clayton!!!
{griddef}
{outfl} -file for simulation output
2 - output format: (0=reg, 1=coord, 2=binary)
impute.out -file for imputed values in case of heterotopic samples
0 -debugging level: 0,1,2,3
sgsim.dbg -file for debugging output
START OF SRCH:
25 -number of data to use per variable
300 300 10 -maximum search radii (hmax,hmin,vert)
0 0 0 -angles for search ellipsoid
1 -sort by distance (0) or covariance (1)
0 1 1 -if sorting by covariance, indicate variogram rock type, head, tail to use
START OF VARG:
2 -number of variograms
0 1 1 -rock type, variable 1, variable 2
{varmodel1}
0 2 2 -rock type, variable 1, variable 2
{varmodel2}
START OF DATA:
{datafl} -file with primary data
{xycols} 0 0 0 - columns for X,Y,Z,wt,rock type
{varcols} - columns for variables
1 - clip data to grid, 1=yes
1 - assign to the grid, 0=none, 1=nearest, 2=average
-8.0 1.0e21 - trimming limits
"""
# Invariant parameters; the fitted NS variogram models condition each variable
pars = dict(griddef=griddef, varmodel1=vargs[0].model,
            varmodel2=vargs[1].model, datafl=dat_ns.flname,
            xycols=dat_ns.gscol(dat_ns.xyz),
            varcols=dat_ns.gscol(dat_ns.variables))
callpars = [] # list of parameter files, one per realization
seeds = gs.rseed_list(nreal, seed=23243) # per-realization seeds from a master seed
for i, seed in enumerate(seeds):
    pars['seed'] = seed
    pars['outfl'] = simdir+'real{}.gsb'.format(i+1)
    callpars.append(dict(parstr=parstr.format(**pars)))
gs.runparallel(usgsim, callpars, reportprogress=True)
This step is not strictly required, but is presented for demonstration.
# Read in the first realization to inspect (demonstration only)
sim = gs.DataFile(simdir+'real1.gsb')
# Rename the simulation columns to the NS variable names
sim.columns=dat_ns.variables
# Summary statistics
print('\nProperties of the realization:\n', sim.describe())
# gs.subplots is useful when multiple panels are plotted that
# should share the same colorbar
fig, axes = gs.subplots(1, 2, figsize=(10, 10), cbar_mode='single')
for var, ax in zip(dat_ns.variables, axes):
    gs.pixelplt(sim, var=var, vlim=(-3, 3), title=var+' Realization', ax=ax)
Note that ubacktr program only requires a prefix of the transformation table, before it infers the file name based on the number of variables and categories.
# Back-transform realizations to original units; ubacktr infers the full
# transformation-table file name from the given prefix.
backdir = outdir+'UBACKTR/'
gs.mkdir(backdir)
ubacktr = gs.Program(program=exedir+'ubacktr')
parstr = """ Parameters for UBACKTR
**********************
START OF PARAMETERS:
{datafl} -file with simulated Gaussian variables (see Note6)
-7.0 1.0e21 - trimming limits
2 - number of variables
1 2 - columns for variables
0 -number of rocktypes (NRT) (0 if none)
nofile.out - file for simulated RTs (see Note1 and Note6)
5 - column for RT
31 32 34 35 36 37 - RT category codes (see Note2)
{nx} {ny} 1 -nx, ny, nz (0=calculated)(see Note3)
1 -number of realizations
{trnfl} -prefix for trans tables (see Note4 and Note7)
{outfl} -output file (see Note6)
"""
pars = dict(nx=griddef.nx, ny=griddef.ny, trnfl=outdir+'nscore')
callpars = []
for i in range(nreal):
    pars['datafl'] = simdir+'real{}.gsb'.format(i+1)
    pars['outfl'] = backdir+'real{}.gsb'.format(i+1)
    callpars.append(dict(parstr=parstr.format(**pars)))
gs.runparallel(ubacktr, callpars, reportprogress=True)
# Remove the Gaussian realizations since they're no longer needed
gs.rmdir(simdir)
Generate a figure for each variable, where a single color bar is used for multiple realizations.
# One figure per variable: four back-transformed realizations sharing a
# colorbar, then histogram reproduction versus the declustered reference.
for var in dat.variables:
    fig, axes = gs.subplots(2, 2, figsize=(10, 8), cbar_mode='single')
    # Color limits based on the data
    vlim = (np.round(dat[var].min()), np.round(dat[var].max()))
    for real, ax in enumerate(axes):
        sim = gs.DataFile(backdir+'real{}.gsb'.format(real+1))
        sim.columns = dat.variables
        gs.pixelplt(sim, var=var, title='Realization {}'.format(real+1),
                    pointdata=dat, pointkws={'edgecolors':'k', 's':25},
                    vlim=vlim, cbar_label='m', ax=ax)
    # Label the figure
    fig.suptitle(var, **{'weight':'bold'})
# Histogram reproduction check across all realizations
fig, axes = plt.subplots(1, 2, figsize=(10, 5))
for i, (var, ax) in enumerate(zip(dat.variables, axes)):
    gs.histpltsim(backdir+'real*.gsb', dat, refvar=var, refwt=True,
                  refclr='C3',
                  simvar=i+1, sim_fltype='gsb', title=var, ax=ax)
The variogram of the data in original units must be calculated first, since the normal score variogram was previously calculated.
# Calculate the experimental data variograms for comparison against the
# realization variograms.
# NOTE(review): the surrounding text says this should be in original units,
# yet dat_ns (normal scores) is used here — confirm whether dat was intended.
# NOTE(review): 'axes' here is the histogram figure's axes from the previous
# cell, and 'ax' is unused in the loop body; zip merely limits the iteration.
vargs = []
for var, ax in zip(dat_ns.variables, axes):
    varg = gs.Variogram(dat_ns, var, ndim=2, ndir=1, omnihz=True,
                        mute=True)
    varg.update_calcpars(nlags=10, lagdist=30.0, lagtol=20.0, azm=0,
                         azmtol=90, bandhorz=10000, variostd=True)
    varg.varcalc()
    vargs.append(varg)
# Calculate the variograms of the realizations
for i, varg in enumerate(gs.log_progress(vargs)):
    varg.update_simpars(datafl=backdir+'real*.gsb', nvar=1,
                        varcols=i+1)
    varg.varsim(maxlags=50, vargrng=500) # calculates a variogram for every realization; '*' expands over the realization files
# Plot the experimental, model, and realization variograms
fig, axes = plt.subplots(1, 2, figsize=(10, 4))
for var, varg, ax in zip(dat.variables, vargs, axes):
    fig = varg.plot(sim=True, titles=var, colors=['C3'],
                    figsize=(8, 3), axes=ax)
fig.tight_layout() # a well-behaved, symmetric histogram generally accompanies good variogram reproduction
Subtract the thickness from the top elevation, providing the base elevation realizations. Output all values within a single directory.
# Derive the base elevation per realization (top minus thickness) and plot it.
surfdir='Surfaces/'
gs.mkdir(surfdir)
for real in range(nreal):
    sim = gs.DataFile(backdir+'real{}.gsb'.format(real+1))
    sim.columns=['Top Elevation', 'Thickness']
    sim['Base Elevation'] = sim['Top Elevation'] - sim['Thickness']
    sim.writefile(surfdir+'real{}.gsb'.format(real+1))
var = 'Base Elevation'
vlim = (np.round(dat[var].min()), np.round(dat[var].max()))
fig, axes = gs.subplots(2, 2, figsize=(10, 8), cbar_mode='single')
for real, ax in enumerate(axes):
    sim = gs.DataFile(surfdir+'real{}.gsb'.format(real+1))
    gs.pixelplt(sim, var=var, title='Realization {}'.format(real+1),
                pointdata=dat, pointkws={'edgecolors':'k', 's':25},
                cbar_label='m', ax=ax)
fig.suptitle(var, **{'weight':'bold'})
Output coordinates will be in single precision 'float32' following the gsParams setting below (may be problematic with large utms). This is used here to conserve output file size.
When the DataFile is initialized, it is registered as a structured grid (dftype='sgrid') with specified z coordinates.
Use of a vtk extension in writefile leads to VTK output, where x and y coordinates are assumed to follow the regular grid, whereas z follows irregular coordinates. Note that at least one coordinate must be irregular for sgrid to register as valid.
# Write single-precision VTK surfaces, then scan all realizations for the
# global z range needed by the 3-D grid.
gs.gsParams['data.write_vtk.cdtype'] = 'float32'
for var in ['Base Elevation', 'Top Elevation']:
    sim = gs.DataFile(surfdir+'real1.gsb', z=var, dftype='sgrid')
    sim.writefile('{}_real1.vtk'.format(var), variables=[var])
minz = []
maxz = []
for real in range(nreal):
    surf = gs.DataFile(surfdir+'real{}.gsb'.format(real+1))
    minz.append(np.min(surf['Base Elevation']))
    maxz.append(np.max(surf['Top Elevation']))
# Round outward to integers that bound every realization
minz = int(min(minz))
maxz = int(max(maxz)+1)
print('min and max z elevation = {}, {}'.format(minz, maxz))
Set the 3-D grid to the default for the remainder of this notebook (as well as subsequent notebooks).
# Extend the 2-D project grid to 3-D using the z range found above, then
# set the 3-D grid as the project default going forward.
gridarr = griddef.gridarray()
print('2-d grid array =', gridarr)
# Calculate the z grid attributes
zsiz = 1.0
# Number of z cells covering [minz, maxz] at zsiz resolution. The cast is
# applied AFTER the division so nz is an integer cell count — the original
# `int(maxz - minz + 1)/zsiz` cast first and then divided, yielding a float.
nz = int((maxz - minz + 1)/zsiz)
# Center of the lowest cell
mnz = minz + zsiz/2.0
# Modify the grid array to reflect the above attributes
gridarr = gridarr[0:6]+(nz, mnz, zsiz)
print('3-d grid array =', gridarr)
# Initialize the 3-D grid, before setting as the default
# going forward
griddef = gs.GridDef(gridarr=gridarr)
gs.gsParams['data.griddef'] = griddef
griddef
Used for constraining subsequent modeling. Only locations that are within the domain (from the previous notebook) and between the top/bottom surfaces need to be simulated.
# Build a 3-D keyout per realization: a node is active (1) when its column is
# inside the areal domain AND its elevation lies between base and top.
keydir = 'Keyouts/'
gs.mkdir(keydir)
# Will track the number of realizations that are in the domain
# for each grid node
prob = np.zeros(griddef.count())
for real in gs.log_progress(range(nreal)): # progress bar over realizations
    # Domain realization as a 2-D logical array (True = inside)
    dom = gs.DataFile(domaindir+'real{}.gsb'.format(real+1))
    dom = dom.data.values == 1 # DataFile -> numpy; ==1 yields a logical array
    dom = dom.reshape((griddef.nx, griddef.ny), order='F')
    # Top and bottom surfaces for this realization
    surf = gs.DataFile(surfdir+'real{}.gsb'.format(real+1)) # holds both top and base surfaces
    top = surf['Top Elevation'].values
    top = top.reshape((griddef.nx, griddef.ny), order='F') # reshape top surface to the grid
    bot = surf['Base Elevation'].values
    bot = bot.reshape((griddef.nx, griddef.ny), order='F')
    # Generate the keyout for this realization (all inactive to start)
    keyout = np.zeros((griddef.nx, griddef.ny, griddef.nz))
    for ix in range(griddef.nx): # iterate grid columns
        for iy in range(griddef.ny):
            if dom[ix, iy]:
                # This vertical column is within the areal domain,
                # so activate the cells between base and top
                x, y, _ = griddef.gridcoord(ix, iy, 1)
                _, _, ibot, ingrid = griddef.coord_to_index3d(x, y,
                                                              bot[ix, iy])
                _, _, itop, ingrid = griddef.coord_to_index3d(x, y,
                                                              top[ix, iy])
                keyout[ix, iy, ibot:itop+1] = 1
    keyout = keyout.reshape(griddef.count(), order='F') # flatten the 3-D array in GSLIB order
    prob = prob + keyout
    # Write out
    keyout = gs.DataFile(data=pd.DataFrame(keyout, columns=['Keyout']))
    keyout.writefile(keydir+'real{}.gsb'.format(real+1), fmt=[1])
# Calculate the probability to be within the domain: a node inside every
# realization accumulated nreal counts, so dividing by nreal yields 1
prob = prob / nreal
Also, include locations where data are present. This will have a variety of utilities.
# Union keyout across all realizations: active wherever any realization is
# inside the domain, plus wherever 3-D data are located.
keyout = (prob > 0).astype(int)
# Find the indices where 3-D data are located
dat = gs.DataFile('../data/PRM_data.dat')
idx, ingrid = griddef.coord_to_index1d(dat[dat.x], dat[dat.y], dat[dat.z])
# Fail loudly if any sample falls outside the grid. The original passed
# print(...) as the assert message, which evaluates to None; the message
# string is now passed directly.
assert all(ingrid), 'not all data are in the grid!'
# Assign a keyout of 1 at every data location
keyout[idx] = 1
# Write out
keyout = gs.DataFile(data=pd.DataFrame(keyout, columns=['Keyout']))
keyout.writefile(keydir+'allreals.gsb', fmt=[1])
The domain probability is written as the efficient rectilinear grid type since keyout.dftype = 'grid'.
# Write the domain probability (efficient rectilinear grid since the
# DataFile registers as dftype='grid'), save settings, and clean up.
prob = gs.DataFile(data=pd.DataFrame(prob, columns=['Domain Probability']))
prob.writefile('domain_probability.vtk')
gs.gsParams.save('gsParams.txt')
gs.rmdir(outdir)
gs.rmfile('temp')
The following notebook is comprised of 13 primary steps:
import pygeostat as gs
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import time
% matplotlib inline
Load the Matplotlib and Pygeostat project defaults.
# Load the plotting defaults and project settings carried over from the
# structural notebook, then localize run settings for this notebook
gs.gsPlotStyle.load('../00-Introduction/gsParams.txt')
gs.gsParams.load('../02-Structural/gsParams.txt')
gs.gsParams['config.nprocess'] = 1  # run CCG programs serially
gs.gsParams['data.nreal'] = 4  # number of realizations for this run
Assign nreal and griddef settings to variables for convenience.
# Convenience aliases for settings used throughout the notebook
nreal = gs.gsParams['data.nreal']
print('nreal = {}'.format(nreal))
griddef = gs.gsParams['data.griddef']
griddef  # display the grid definition
# Path to the CCG executables
exedir = '../exes/'
# Keyout realizations produced by the structural notebook
keydir = '../02-Structural/Keyouts/'
keyfl_all = keydir+'allreals.gsb'
# create the output directory
outdir = 'Output/'
gs.mkdir(outdir)
Through the use of the following dictionaries, category names and colors do not need to be specified in any of the function calls.
# Dictionary of the category codes/names
catdict = {1:'Sand', 2:'Breccia', 3:'SIHS', 4:'MIHS', 5:'Mud'}
gs.gsParams['data.catdict'] = catdict
# Dictionary of category colors
gs.gsParams['plotting.cmap_cat'] = {1:[ 1., 0.95, 0.07], # RGB colour codes per category
                                    2:[0.96, 0.65, 0.11],
                                    3:[0.98, 0.43, 0.09],
                                    4:[0.8, 0.4, 0.],
                                    5:[0.5, 0.5,0.5]}
# Number of categories, used repeatedly below
ncat = len(catdict)
Will inspect the time of this notebook and bottleneck steps.
start_workflow_time = time.time()  # start timing; this is a lengthy notebook
# Load the drill hole data and echo the attributes pygeostat detected
dat = gs.DataFile('../data/PRM_data.dat')
print('drill hole column = {}'.format(dat.dh))
print('x, y and z columns (composite center) = {} {} {}'.format(dat.x, dat.y, dat.z))
print('drill hole from and to = {}, {}'.format(dat.ifrom, dat.ito))
print('variable columns = {}'.format(list(dat.variables)))
print('category column = {}'.format(dat.cat))
print('category dictionary = {}'.format(dat.catdict))
dat.describe(dat.columns)
The variables will not be used in this notebook, so remove them for simplicity and reduced storage (non-trivial due to the upcoming imputation). This data has already been composited, so the from/to is no longer needed.
# Only the categories are needed here: drop the from/to intervals and the
# continuous variables, then write the reduced file for program input
dat.drop([dat.ifrom, dat.ito] + dat.variables)
dat.flname = outdir+'categories.dat'
dat.writefile(dat.flname)
dat.describe(dat.columns)
# Cell declustering of the category codes (GSLIB declus program)
declus = gs.Program(program=exedir+'declus')
parstr = """ Parameters for DECLUS
*********************
START OF PARAMETERS:
{datafl} -file with data
{xyzcols} {catcol} - columns for X, Y, Z, and variable
-98 1.0e21 - trimming limits
declus.sum -file for summary output
{declusfl} -file for output with data & weights
1.0 1.0 -Y and Z cell anisotropy (Ysize=size*Yanis)
0 -0=look for minimum declustered mean (1=max)
1 100 100 -number of cell sizes, min size, max size
5 -number of origin offsets
"""
declusfl = outdir+'declus.out'
pars = dict(datafl=dat.flname, xyzcols=dat.gscol(dat.xyz),
            catcol=dat.gscol(dat.cat), declusfl=declusfl)
declus.run(parstr=parstr.format(**pars))
# The summary file is not used further
gs.rmfile('declus.sum')
dat_declus = gs.DataFile(declusfl)
Note that repr is a Python standard class function, accessing a formatted string representation of the object (proportions with their description in this case).
# Compare naive vs declustered category proportions side by side
fig, axes = gs.subplots(1, 2, figsize=(10, 5), aspect=False)
# Calculate the proportions
proportion = gs.Proportion(dat) # class infers the category column from dat
# Print the proportions
print(repr(proportion)+'\n') # formatted string representation
# Plot the proportions
proportion.plot(ax=axes[0])
# Repeat the above, but using declustering weights
proportion = gs.Proportion(dat_declus, wt=True)
print(repr(proportion)+'\n')
proportion.plot(ax=axes[1])
Required input to the maketrend program.
# Build one indicator column per category, as required input to maketrend.
# dat_ind aliases dat; the new columns are added in place.
dat_ind = dat
names = ['Indicator for Category '+str(code) for code in catdict]
for code, name in zip(catdict, names):
    # The column is 1 for records matching the category code, 0 otherwise
    dat_ind[name] = (dat_ind[dat_ind.cat] == code).astype(int)
# Output for input to maketrend
dat_ind.flname = outdir+'indicators.out'
dat_ind.writefile(dat_ind.flname)
# Reset the variable columns to the indicators
dat_ind.setvarcols(names)
print('indicator columns', dat_ind.variables)
dat_ind.describe()
# Calculate locally varying category proportions (trend) with maketrend
maketrend = gs.Program(exedir+'maketrend')
parstr = """ Parameters for MAKETREND
*******************************
START OF PARAMETERS:
{datafl} -file with data
{xyzcols} - columns for X, Y, Z coordinates
{ncat} {indcols} - number of variables, column numbers
-98 1.0e11 - trimming limits
2 -trend type: 0 - vertical; 1 - aerial; 2 - 3D (must be 2)
0 -type of averaging volume: 0 - sphere, 1 - cube
1000.0 1000.0 10.0 -search radii
0.0 0.0 -azimuth and dip
200 -nearest neighbors to use (0 for all) #averaging 200 data
1000 -distance cutoff
0 -weighting function: 0 - equal, 1 - Gaussian
1 3 -apply post processing filter: 1 - yes (must be 0)
5 5 1 -filter size: nx, ny, nz
0 -filter weighting function: 0 - equal, 1 - Gaussian
1 -ensure sum to 1 for compositional trends: 1 - yes
0 -use keyout array? (0=no,1=yes)
nofile - file for keyout
1 1 - column for keyout and value to keep
{trendfl} -file for trend output (must be GSB)
{griddef}
0 -if vertical trend, generate a plot, 1 - yes
vertical.ps -file for postscript output
1 -cumulative curves, 1 - yes
0. 1. -min and max values for x axis
50 -scaling factor for x axis
1 2 -colors for curves, see codes below
Additional Options/Notes
- this version can be applied to large models where a number of cells do
not require estimation. It uses a keyout file, for only estimating the
trend of specified nodes. This is only provided for the 3-D option.
The post-processing filter is not tested with this keyout and may cause errors.
This program only outputs a GSB formatted file with the 3-D option.
- for vertical trends, a 2D/3D model is output if nx and/or ny are > 1
"""
trendfl = outdir+'maketrend.gsb'
pars = dict(datafl=dat_ind.flname,
            xyzcols=dat_ind.gscol(dat_ind.xyz),
            ncat=ncat,
            indcols=dat_ind.gscol(dat_ind.variables),
            trendfl=trendfl, griddef=griddef)
maketrend.run(parstr=parstr.format(**pars), liveoutput=False)
All sections will be through the midpoint in each dimension for basic inspection.
# Constant slice numbers: midpoint of the grid in each dimension
sx = int(griddef.nx/2)
sy = int(griddef.ny/2)
sz = int(griddef.nz/2)
# Aspects and figure sizes per section orientation
aspects = {'xy':True, 'xz':False, 'yz':False}
figsizes = {'xy':(10, 6), 'xz':(10,4), 'yz':(10,4)}
# Point size setting for location maps
gs.gsParams['plotting.locmap.s'] = 2
The keyout is used for drawing attention away from areas that won't be simulated (and may have search related artifacts). This keyout could be used as input to the maketrend program, but would prevent the post-processing algorithm.
# Mask out cells outside the domain (keyout == 0) so plots draw attention
# only to the area that will be simulated
keyout = gs.DataFile(keyfl_all)
keyout = keyout['Keyout'] == 0
trend = gs.DataFile(trendfl)
trend.data.loc[keyout] = np.nan
# One figure of trend sections per orientation, sliced at the midpoints
for orient, sliceno in zip(['xy', 'xz', 'yz'], [sz, sy, sx]):
    aspect, figsize = aspects[orient], figsizes[orient]
    fig, axes = gs.subplots(2, 3, cbar_mode='single',
                            figsize=figsize, aspect=aspect)
    for var, ax, cat in zip(trend.variables, axes, catdict.values()):
        gs.pixelplt(trend, var=var, pointdata=dat_ind,
                    ax=ax, sliceno=sliceno, orient=orient,
                    vlim=(0, 1), title=cat, cbar_label='Proportion')
    # Remove the unused axes and add labels as required
    gs.subplots_clean(axes, ncat, 3)
    fig.suptitle(orient.upper()+' Section')
Since locally varying proportions (and thresholds) will be used within the HTPG framework, the simulated latent Gaussian variables should not truncate to the indicator variograms, but the indicator residual variograms.
Calculate the indicator residuals, before calculating and modeling the associated variograms.
# Grid indices of the data locations, used to look up the collocated trend
idx, _ = griddef.coord_to_index1d(dat[dat.x], dat[dat.y], dat[dat.z])
names = []
for var in dat_ind.variables:
    resid_name = var+' (Local Residual)'
    # Indicator residual = indicator value minus the local trend proportion
    dat_ind[resid_name] = dat_ind[var] - trend[var].values[idx]
    names.append(resid_name)
# Replace the indicator columns with their residual counterparts
dat_ind.drop(dat_ind.variables)
dat_ind.setvarcols(names)
dat_ind.variables
Note that it is much faster to use the varcalc program, where multiple variables are calculated with a single search. It is not used here for consistency and convenience.
# Calculate experimental variograms of the indicator residuals,
# one Variogram object per indicator variable
vargs = []
for i, var in enumerate(gs.log_progress(dat_ind.variables)):
    varg = gs.Variogram(dat_ind, var=var, ndir=2, omnihz=True, variostd=True,
                        mute=True)
    varg.inferdirections(azm=0.0, dip=0.0, tilt=0.0)
    # (nlags, lagdist, lagtol) tuples give (horizontal, vertical) parameters
    varg.update_calcpars(nlags=(20, 10), lagdist=(10, 1.0), lagtol=(5, .5),
                         azmtol=90.0, diptol=10.0,
                         bandhorz=(1.0e21, 10.0), bandvert=3.0 )
    varg.varcalc()
    vargs.append(varg)
# Auto-fit a 3-structure model (no nugget) to each residual variogram
# and plot the horizontal/vertical pairs
fig, axes = plt.subplots(5, 2, figsize=(8, 15))
for i, (varg, cat, ax) in enumerate(zip(gs.log_progress(vargs), catdict.values(), axes)):
    varg.update_modelpars(nst=3, c0=0.0, it=1)
    varg.fitmodel(sill=1.0)
    varg.plot(model=True, titles=(cat+' Horizontal', cat+' Vertical'), axes=ax)
    ax[0].set_xlabel('Lag (m)')
    ax[1].set_xlabel('Lag (m)')
fig.tight_layout()
The probability for each facies to transition into each other is calculated next. Along with the previously calculated variograms, the transition probability informs the ordering of the truncation mask.
The intrinsic repr function of TransitProb is a formatted string representation of the matrix.
# Transition probability matrix; the category column is inferred from dat
transitprob = gs.TransitProb(dat)
transitprob  # display the formatted matrix
The plot function is available for improving visualization (relative to the string above).
transitprob.plot(vlim=(0, .2))
Converting the transition probability matrix to a dissimilarity matrix permits the use of MDS mapping, which may simplify understanding.
# Dissimilarity matrix and its MDS projection for easier interpretation
fig, axes = plt.subplots(1, 2, figsize=(10, 5))
transitprob.plot_dissim(ax=axes[0], vlim=(.8, 1))
transitprob.plot_mds(ax=axes[1], vpad=.05)
The HTPG truncation mask should consider two priorities:
The two priorities are often at odds. In this case, there is no clear order relations according to the transition probabilities, so variogram reproduction will be emphasized by the mask.
A three-level dictionary is used for specifying the dictionary within the HierTPG object that is introduced in the next section.
# HTPG truncation mask: gN keys are latent Gaussian variables, tN keys are
# thresholds; each threshold lists the category codes below/above it
mask = {'g1': {'t1':{'below': [5, 4, 3, 2], 'above': 1}},
        'g2': {'t2':{'below': [5, 4, 3], 'above': 2}},
        'g3': {'t3':{'below': [5, 4], 'above': 3}},
        'g4': {'t4':{'below': 5, 'above': 4}},
        }
Beyond the parameters that have already been prepared, the HTPG framework requires the use of 4 HTPG programs (htpg_thresholds, htpg_gaussvarg, htpg_cat2gauss and htpg_gauss2cat), along with intermittent steps such as latent Gaussian variogram modeling and simulation.
To simplify this process, an HTPG object is provided by Pygeostat for executing these 4 programs and streamlining the intermittent steps. This allows for minimal parameters to be specified with each progressive function.
The object is initiated with the mask, before using its plot function to ensure that the dictionary was properly set.
# Initialize the HTPG workflow object and display the truncation tree
htpg = gs.HierTPG(mask)
htpg.plot(figsize=(5, 8))
Now that the truncation mask is defined, local thresholds may be calculated based on the locally varying proportions.
The proportion object is used for specifying the global proportions, which are required in addition to the file name containing the local proportion trend.
# Calculate locally varying thresholds from the global proportions and
# the locally varying proportion trend file
htpg.thresholds(proportion.props, locpropfl=trend.flname,
                locthreshfl=outdir+'localthresh.gsb', exedir=exedir)
The locthreshfl name is appended to the HierTPG object to permit future calculations without its specification, although it may also be applied for user convenience (as will be seen with other attributes and file names).
# Plot midpoint sections of the locally varying Gaussian thresholds.
thresh = gs.DataFile(htpg.locthreshfl)
for orient, sliceno in zip(['xy', 'xz', 'yz'], [sz, sy, sx]):
    aspect, figsize = aspects[orient], figsizes[orient]
    fig, axes = gs.subplots(2, 2, cbar_mode='single',
                            figsize=figsize, aspect=aspect)
    for var, ax, cat in zip(thresh.variables, axes, catdict.values()):
        gs.pixelplt(thresh, var=var, pointdata=dat_ind,
                    ax=ax, sliceno=sliceno, orient=orient,
                    # These are Gaussian threshold values (hence vlim of
                    # +/-3), so label the colorbar accordingly (the label
                    # previously read 'Proportion')
                    vlim=(-3, 3), title=var, cmap='bwr',
                    cbar_label='Threshold')
    fig.suptitle(orient.upper()+' Section')
Given the truncation mask, the variograms of Gaussian random fields that when truncated, will yield the previously calculated indicator (residual) variograms are calculated.
A list of indicator variogram objects, which are required to have fit models (as previously done), are input to the calculation.
htpg.gaussvarg(vargs, exedir=exedir)
htpg.gaussvargs is a list of variogram objects, which have the experimental Gaussian variogram that is optimized to yield the targeted indicator variograms. These variograms are now fit with models that will be input to simulation.
# Fit models to the optimized experimental Gaussian variograms; these
# models are input to simulation
fig, axes = plt.subplots(htpg.ngauss, 2, figsize=(8, 12))
for i, (varg, ax) in enumerate(zip(gs.log_progress(htpg.gaussvargs), axes)):
    varg.update_modelpars(nst=3, c0=0.01, it=3)
    varg.fitmodel(sill=1.0)
    varg.plot(model=True, titles=('Gaussian {} Horizontal'.format(i+1),
                                  'Gaussian {} Vertical'.format(i+1)), axes=ax)
    ax[0].set_xlabel('Lag (m)')
    ax[1].set_xlabel('Lag (m)')
fig.tight_layout()
Now that the truncation mask, Gaussian variogram models and locally varying thresholds are defined, the Gaussian data realizations may be imputed as the final step prior to simulation of the grid.
Set maximum search radii based on the correlation extent in the Gaussian variograms above. gaussdatadir is the output directory for Gaussian data realizations.
# Maximum search radii (hmax, hmin, vert) based on the Gaussian variogram
# correlation extents; gaussdatadir holds the imputed data realizations
searchradii = [200, 200, 4]
gaussdatadir = outdir+'GaussDataReals/'
gs.mkdir(gaussdatadir)
Time the execution since this may take a few minutes, despite parallel execution within the function.
# Impute Gaussian data realizations from the categories (timed; this can
# take a few minutes despite internal parallelism)
start_time = time.time()
htpg.cat2gauss(dat, gaussdatadir, searchradii=searchradii,
               exedir=exedir)
imptime = (time.time() - start_time)/60
print('execution took {} minutes'.format(imptime))
# Inspect the final imputed realization
dat_gauss = gs.DataFile(htpg.gaussdatafls[nreal-1])
print('columns = ', list(dat_gauss.columns))
dat_gauss.describe()
# Directory for simulated Gaussian realizations and the usgsim program
simgaussdir = 'Output/GaussReals/'
gs.mkdir(simgaussdir)
usgsim = gs.Program(program=exedir+'usgsim')
The (potentially intimidating) parstr will be iteratively formatted with variables that are constant, change by variable and change by realization.
Note that the parameter file could be simplified (and execution time reduced) by simulating multiple variables simultaneously, though this generally comes at the expense of variogram reproduction.
# usgsim parameter template; formatted per variable and per realization
parstr = """ Parameters for USGSIM
*********************
START OF MAIN:
1 -number of realizations to generate, 0=kriging
1 -number of variables being simulated
1 -number of rock types to consider
{seed} -random number seed
{gridstr}
{outfl} -file for simulation output
2 - output format: (0=reg, 1=coord, 2=binary)
impute.out -file for imputed values in case of heterotopic samples
0 -debugging level: 0,1,2,3
sgsim.dbg -file for debugging output
START OF SRCH:
40 -number of data to use per variable
{searchradii} -maximum search radii (hmax,hmin,vert)
0.0 0.0 0.0 -angles for search ellipsoid
1 -sort by distance (0) or covariance (1)
1 1 1 -if sorting by covariance, indicate variogram rock type, head, tail to use
START OF VARG:
1 -number of variograms
1 1 1 -rock type, variable 1, variable 2
{vargstr}
START OF DATA:
{datafl} -file with primary data
{xyzcols} 0 0 - columns for X,Y,Z,wt,rock type
{varcol} - columns for variables
1 - clip data to grid, 1=yes
1 - assign to the grid, 0=none, 1=nearest, 2=average
-998.5 1.0e21 - trimming limits
START OF ROCK:
1 -rock type codes
{keyfl} -file with rock types
1 -column for rock type
"""
# Parameters that are constant
searchradii = ' '.join([str(i) for i in searchradii])
pars = dict(gridstr=griddef, searchradii=searchradii,
            xyzcols=dat_gauss.gscol(dat_gauss.xyz))
# One parameter file per (Gaussian variable, realization) pair, each with
# its own random seed, for parallel execution
callpars = []
iseed = 0
seeds = gs.rseed_list(htpg.ngauss*nreal, seed=212121)
for igauss, (var, varg) in enumerate(zip(dat_gauss.variables,
                                         htpg.gaussvargs)):
    # Parameters that change by variable
    pars['varcol'] = dat_gauss.gscol(var)
    pars['vargstr'] = str(varg.model)
    for ireal in range(nreal):
        # Parameters that change by realization
        pars['datafl'] = htpg.gaussdatafls[ireal]
        pars['outfl'] = simgaussdir+\
            'real{}_gauss{}.gsb'.format(ireal+1, igauss+1)
        pars['keyfl'] = keydir+'real{}.gsb'.format(ireal+1)
        pars['seed'] = str(seeds[iseed])
        iseed += 1
        # Add this parstr to the callpars
        callpars.append({'parstr': parstr.format(**pars)})
Not required, but used here for clarity.
print(callpars[-1]['parstr'])
Time the execution since this may take a few minutes, despite parallel execution.
# Run all usgsim parameter files in parallel (timed)
start_time = time.time()
gs.runparallel(usgsim, callpars, reportprogress=True)
simtime = (time.time() - start_time)/60
print('execution took {} minutes'.format(simtime))
The next step will require files that have all latent Gaussian variables in one file, so merge them by realization.
# Merge the per-variable Gaussian realizations into one file per
# realization, removing the per-variable files as they are consumed
for ireal in gs.log_progress(range(nreal)):
    for igauss in range(htpg.ngauss):
        gaussfl = simgaussdir+'real{}_gauss{}.gsb'.format(ireal+1,
                                                          igauss+1)
        sim = gs.DataFile(gaussfl)
        if igauss == 0:
            # The first variable initializes the merged file
            merge = sim
            merge.rename({merge.columns[0]: 'Gaussian 1'})
        else:
            merge['Gaussian '+str(igauss+1)] = sim.data.values
        # The dedicated per-variable file is no longer needed
        gs.rmfile(gaussfl)
    merge.writefile(simgaussdir+'real{}.gsb'.format(ireal+1))
    # Free memory before the next realization
    del merge
    del sim
The final step of the simulation truncates the latent Gaussian variables to generated realizations of the categorical variables.
As seen elsewhere, use of the wildcard specifies that the input/output files contain dedicated realizations that are named 1,...,nreal.
# Truncate the latent Gaussian realizations to category realizations;
# the wildcard expands to realizations 1..nreal
simcatdir = outdir+'Categories/'
gs.mkdir(simcatdir)
htpg.gauss2cat(simgaussdir+'real*.gsb', simcatdir+'real*.gsb', exedir=exedir)
Note that the conditioning data is displayed but is hard to discern (since the values and their spatial variability are reproduced). Consider using pointkws={'edgecolors': 'k'} to display the results.
# Display example category realizations on the three midpoint sections.
# FIX: the yz section was previously sliced at sz; it must use sx
# (consistent with the other section loops in this workflow).
for orient, sliceno in zip(['xy', 'xz', 'yz'], [sz, sy, sx]):
    aspect, figsize = aspects[orient], figsizes[orient]
    fig, axes = gs.subplots(2, 2, cbar_mode='single',
                            figsize=figsize, aspect=aspect)
    for ireal, ax in enumerate(axes):
        sim = gs.DataFile(htpg.simcatfls[ireal])
        gs.pixelplt(sim, var=sim.cat, catdata=True,
                    pointdata=dat, pointvar=dat.cat,
                    ax=ax, sliceno=sliceno, orient=orient,
                    title='Realization {}'.format(ireal+1),
                    cbar_label='Category')
    fig.suptitle(orient.upper()+' Section')
With simulation complete, basic properties may now be inspected, including the proportion and variogram reproduction that is displayed here.
The previously initialized Proportion object allows for the rapid calculation of simulation proportions.
# Calculate and display the simulated proportions per realization and
# their average across realizations
proportion.calcsim(simcatdir+'real*.gsb')
print(repr(proportion.simprops)+'\n')
print(repr(proportion.simavgprops))
The Proportion.plotsim function displays a cross-plot between the proportion of the realizations and the declustered data proportions.
proportion.plotsim(figsize=(6, 6))
Correct the simulated category proportions so that they average to the data proportions. A kernel-based algorithm is used for ensuring data reproduction while minimizing changes to the realization continuity.
Note that this step is not necessarily recommended in the present scenario, though the tool is demonstrated since it is sometimes required for explicit data reproduction.
# Only the output file name for the corrected proportions is
# required as input
# NOTE(review): this points at a top-level 'Categories/' directory rather
# than Output/Categories/ — presumably so the corrected realizations
# survive the rmdir(outdir) at the end of the notebook; confirm.
simcatdir = 'Categories/'
gs.mkdir(simcatdir)
proportion.correctsim(simcatdir+'real*.gsb', exedir=exedir)
proportion.plotsim(figsize=(6, 6))
We need to calculate the categorical variograms, since we previously calculated variograms of the indicator residuals.
# Calculate categorical (indicator) variograms of the data, one per
# category, for checking variogram reproduction of the realizations
vargs = []
for i, (catcode, cat) in enumerate(gs.log_progress(catdict.items())):
    varg = gs.Variogram(dat, var=dat.cat, ndir=2, omnihz=True,
                        variostd=True,
                        mute=True)
    varg.inferdirections(azm=0.0, dip=0.0, tilt=0.0)
    # variotypes=10 with variocutcat yields the indicator variogram
    # of the current category code
    varg.update_calcpars(variotypes=10,
                         variocutcat=[[catcode, catcode]],
                         nlags=(20, 10),
                         lagdist=(10,1), lagtol=(5, .5),
                         azmtol=90.0, diptol=10.0,
                         bandhorz=(1.0e21, 10.0), bandvert=3.0 )
    varg.varcalc()
    vargs.append(varg)
    print('finished', cat)
# Variogram reproduction check: compute realization variograms (timed)
# and plot them against the data variograms
start_time = time.time()
# Calculate the variograms of the realizations
for i, varg in enumerate(gs.log_progress(vargs)):
    varg.update_simpars(datafl=simcatdir+'real*.gsb', varcols=1,
                        nlags=20)
    varg.varsim()
varsimtime = (time.time() - start_time)/60
print('execution took {} minutes'.format(varsimtime))
fig, axes = plt.subplots(5, 2, figsize=(8, 15))
for i, (varg, cat, ax) in enumerate(zip(vargs, catdict.values(), axes)):
    varg.plot(sim=True, titles=(cat+' Horizontal', cat+' Vertical'), axes=ax)
    ax[0].set_xlabel('Lag (m)')
    ax[1].set_xlabel('Lag (m)')
fig.tight_layout()
# Persist the project settings, clean up and report the total run time.
gs.gsParams.save('gsParams.txt')
gs.rmdir(outdir,verbose=True)
gs.rmfile('temp')
# FIX: the elapsed time was previously assigned to `time`, shadowing the
# time module (any later time.time() call, e.g. on cell re-run, would fail)
workflow_time = (time.time() - start_workflow_time)/60
print('workflow execution took {} minutes'.format(workflow_time))
The following notebook is comprised of 7 primary steps:
import pygeostat as gs
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import time
from copy import deepcopy
% matplotlib inline
Load the Matplotlib and Pygeostat project defaults.
# Load plotting defaults and enlarge the default figure size
gs.gsPlotStyle.load('../00-Introduction/gsParams.txt')
gs.set_style(custom={'figure.figsize': (10, 10)})
Assign nreal and griddef settings to variables for convenience.
# Load the settings saved by the categorical notebook and alias the
# values used throughout this notebook
gs.gsParams.load('../03-Categorical/gsParams.txt')
nreal = gs.gsParams['data.nreal']
print('nreal = {}'.format(nreal))
griddef = gs.gsParams['data.griddef']
griddef  # display the grid definition
Through the use of the following dictionaries, category names and colors do not need to be specified in any of the function calls.
# Dictionary of the category codes/names
catdict = {1:'Sand', 2:'Breccia', 3:'SIHS', 4:'MIHS', 5:'Mud'}
gs.gsParams['data.catdict'] = catdict
ncat = len(catdict)
# Path to the CCG executables
exedir = '../exes/'
# Category realizations produced by the categorical notebook
catdir = '../03-Categorical/Categories/'
# create the output directory
outdir = 'Output/'
gs.mkdir(outdir)
Will inspect the time of this notebook and bottleneck steps.
start_workflow_time = time.time()  # start timing the whole notebook
# Load the drill hole data and echo the attributes pygeostat detected
dat = gs.DataFile('../data/PRM_data.dat')
print('drill hole column = {}'.format(dat.dh))
print('x, y and z columns (composite center) = {} {} {}'.format(dat.x, dat.y, dat.z))
print('variable columns = {}'.format(list(dat.variables)))
print('category column = {}'.format(dat.cat))
print('category dictionary = {}'.format(dat.catdict))
dat.describe()
scatplts is a wrapper on the lower-level scatplt function. The nmax kwarg allows for a random sub-set of the data to be displayed, although underlying statistics consider all passed data.
# Scatter-matrix of the continuous variables; nmax subsamples the display
# while the statistics consider all data
pad = (-6, -3.25)
nmax = 1500
fig = gs.scatplts(dat, s=5, nmax=nmax, pad=pad)
# Aliases used throughout the notebook
nvar = dat.nvar
variables = dat.variables
catdict = dat.catdict
cats = catdict.values() # category names, e.g. 'Sand', 'Breccia', ...
ncat = len(cats)
The variables will be modeled in parallel for each category. Therefore, begin the workflow by dividing the variables into data files that are dedicated to each category (simplifies scripting).
# Split the data into one file per category so the variables can be
# modeled independently (and in parallel) within each category.
# The from/to intervals are not required - drop them first
dat.drop([dat.ito, dat.ifrom])
# Keep the data file with all categories as dat_all
dat_all = deepcopy(dat)
gs.mkdir(outdir+'Data/')
dats = {}
for code, name in dat.catdict.items():
    # A fresh copy avoids mutating dat_all through shared references
    dat = deepcopy(dat_all)
    # Keep only the records whose category equals the current code
    dat.data = dat.data.loc[dat[dat.cat] == code]
    # Output
    dat.flname = outdir+'Data/'+name+'.dat'
    dat.writefile(dat.flname)
    if name in ('Sand', 'Mud'):
        # Scatter of mud and sand to note their differing properties
        fig = gs.scatplts(dat, s=5, alpha=.2, c='k',
                          nmax=nmax, pad=pad)
        fig.suptitle(name, y=1.01, weight='bold')
    dats[name] = dat
Declustering is performed to calculate the representative distribution of each variable. This is then input to the projection pursuit multivariate transformation (PPMT), which transforms the variables to be Gaussian and independent. Spatial despiking is recommended prior to the transformation in practice, though it is avoided here to reduce the number of steps. Short scale spatial noise may be introduced as a result.
This is only executed once per category since the variables are isotopically sampled.
# Decluster each category's data once (the variables are isotopically
# sampled, so one declustering per category applies to all variables)
declus = gs.Program(program=exedir+'declus')
parstr = """ Parameters for DECLUS
*********************
START OF PARAMETERS:
{datafl} -file with data
{xyzcols} {varcol} - columns for X, Y, Z, and variable
-98 1.0e21 - trimming limits
declus.sum -file for summary output
{declusfl} -file for output with data & weights
1.0 1.0 -Y and Z cell anisotropy (Ysize=size*Yanis)
0 -0=look for minimum declustered mean (1=max)
1 100 100 -number of cell sizes, min size, max size
10 -number of origin offsets
"""
declusdir = outdir+'Declus/'
gs.mkdir(declusdir)
for cat in catdict.values():
    dat = dats[cat]
    pars = dict(datafl=dat.flname, xyzcols=dat.gscol(dat.xyz),
                varcol=dat.gscol(variables[0]),
                declusfl=declusdir+cat+'.out')
    declus.run(parstr=parstr.format(**pars), liveoutput=False)
    # Re-initialize dat to include the declustering weight
    dat = gs.DataFile(declusdir+cat+'.out')
    dats[cat] = dat
# The summary file is not used further
gs.rmfile('declus.sum')
print('data file name = ', dats['Sand'].flname)
print('declustering attribute = ', dats['Sand'].wts)
The PPMT also outputs normal score data to facilitate the variogram calculation.
# PPMT transformation of each category's variables (run in parallel);
# the program also outputs normal score data for variogram calculation
ppmt = gs.Program(program=exedir+'ppmt')
parstr = """ Parameters for PPMT
*******************
START OF PARAMETERS:
{datafl} -input data file
{nvar} {varcols} {wtcol} - number of variables, variable cols, and wt col
-5 1.0e7 - trimming limits
100 100 100 -min/max iterations and targeted Gauss perc. (see Note 1)
0 -spatial decorrelation? (0=no,1=yes) (see Note 2)
1 2 0 - x, y, z columns (0=none for z)
50 25 - lag distance, lag tolerance
{nscorefl} -output data file with normal score transformed variables
{ppmtfl} -output data file with PPMT transformed variables
{tablefl} -output transformation table (binary)
"""
# Prepare the output directories
nscoredir = outdir+'Nscore/'
ppmtdir = outdir+'PPMT/'
tabledir = outdir+'Table/'
gs.mkdir([nscoredir, ppmtdir, tabledir])
# Generate the parameter files, one per category
callpars = []
for cat in cats:
    dat = dats[cat]
    pars = dict(datafl=dat.flname, nvar=nvar,
                varcols=dat.gscol(variables),
                wtcol=dat.gscol(dat.wts),
                nscorefl=nscoredir+cat+'.out',
                ppmtfl=ppmtdir+cat+'.out',
                tablefl=tabledir+cat+'.trn')
    callpars.append({'parstr':parstr.format(**pars)})
gs.runparallel(ppmt, callpars, reportprogress=True)
Observe that the KDE of the PPMT transformed data approximates the typical Gaussian contours. Also note that the normal score correlation coefficients do not fully parameterize the displayed distributions, discouraging the use of cosimulation or linear decorrelation methods.
# Load the normal score and PPMT outputs per category, keeping only the
# transformed columns (renamed back to the original variable names)
lupad = (-4, -3)

def _load_transformed(flname, prefix):
    # Load a transform output, drop the untransformed variables to
    # conserve space, and strip the transform prefix from column names
    tdat = gs.DataFile(flname)
    tdat.drop(variables)
    tdat.rename({prefix+var: var for var in variables})
    return tdat

dats_ns, dats_ppmt = {}, {}
for cat in cats:
    # Normal score data
    dats_ns[cat] = _load_transformed(nscoredir+cat+'.out', 'Nscore:')
    # PPMT data
    dat = _load_transformed(ppmtdir+cat+'.out', 'PPMT:')
    dats_ppmt[cat] = dat
    # Store since we'll use this modified data outside of
    # the notebook for simulation (and want columns to align)
    dat.writefile(ppmtdir+cat+'.out')
    if cat in ('Sand', 'Mud'):
        fig = gs.scatplts_lu(dats_ns[cat], dats_ppmt[cat],
                             nmax=nmax, pad=lupad, s=2,
                             titles=('Normal Score', 'PPMT'))
# Calculate normal score variograms per category and per variable;
# vargs maps category name -> list of Variogram objects
vargs = {}
for cat in gs.log_progress(cats):
    vargs[cat] = []
    dat = dats_ns[cat]
    for i, var in enumerate(variables):
        varg = gs.Variogram(dat, var=var, ndir=2, omnihz=True,
                            variostd=True, mute=True)
        varg.inferdirections(azm=0.0, dip=0.0, tilt=0.0)
        varg.update_calcpars(nlags=(20, 10), lagdist=(10, 1.0),
                             lagtol=(5, .5),
                             azmtol=90.0, diptol=10.0,
                             bandhorz=(1.0e21, 10.0), bandvert=3.0 )
        varg.varcalc()
        vargs[cat].append(varg)
Unlike prior auto-fitting cells, a minimum constraint of 25m is placed on every horizontal nested structure below. This constraint was deemed necessary for some models (and does not negatively impact any).
# Auto-fit each variogram with a 25 m minimum horizontal range constraint
# on every nested structure (deemed necessary for some models)
for cat in gs.log_progress(cats):
    fig, axes = plt.subplots(nvar, 2, figsize=(8, 3*nvar))
    for i, (var, varg, ax) in enumerate(zip(variables,
                                            vargs[cat],
                                            axes)):
        # ahmax/ahmin give [min, max] range bounds per nested structure
        varg.update_modelpars(nst=3, c0=0.001, it=1,
                              ahmax=[[25, 1.0e21],
                                     [25, 1.0e21],
                                     [25, 1.0e21]],
                              ahmin=[[25, 1.0e21],
                                     [25, 1.0e21],
                                     [25, 1.0e21]])
        varg.fitmodel(sill=1.0)
        varg.plot(model=True, titles=(var+' Horizontal',
                                      var+' Vertical'), axes=ax)
        ax[0].set_xlabel('Lag (m)')
        ax[1].set_xlabel('Lag (m)')
    fig.tight_layout()
    fig.suptitle(cat, y=1.01, **{'fontweight': 'bold'})
The Gaussian variables are independently simulated, before being back-transformed to restore the original units and dependency.
As before, the variables are simulated in dedicated usgsim executions to improve variogram reproduction.
# Simulate each Gaussian variable independently within each category
# (dedicated usgsim runs improve variogram reproduction)
usgsim = gs.Program(program=exedir+'usgsim')
simdir = outdir+'SGSIM/'
gs.mkdir(simdir)
parstr = """ Parameters for USGSIM
*********************
START OF MAIN:
1 -number of realizations to generate, 0=kriging
1 -number of variables being simulated
1 -number of rock types to consider
{seed} -random number seed
{gridstr}
{outfl} -file for simulation output
2 - output format: (0=reg, 1=coord, 2=binary)
impute.out -file for imputed values in case of heterotopic samples
0 -debugging level: 0,1,2,3
sgsim.dbg -file for debugging output
START OF SRCH:
40 -number of data to use per variable
{searchradii} -maximum search radii (hmax,hmin,vert)
0.0 0.0 0.0 -angles for search ellipsoid
1 -sort by distance (0) or covariance (1)
{catcode} 1 1 -if sorting by covariance, indicate variogram rock type, head, tail to use
START OF VARG:
1 -number of variograms
{catcode} 1 1 -rock type, variable 1, variable 2
{vargstr}
START OF DATA:
{datafl} -file with primary data
{xyzcols} 0 {catcol} - columns for X,Y,Z,wt,rock type
{varcol} - columns for variables
1 - clip data to grid, 1=yes
1 - assign to the grid, 0=none, 1=nearest, 2=average
-998.5 1.0e21 - trimming limits
START OF ROCK:
{catcode} -rock type codes
{catfl} -file with rock types
1 -column for rock type
"""
# Parameters that are constant. Grab an arbitrary PPMT
# file for convenience
tdat = dats_ppmt['Sand']
pars = dict(gridstr=griddef, searchradii='100 100 10',
            xyzcols=tdat.gscol(tdat.xyz),
            catcol=tdat.gscol(tdat.cat))
# Initialize the list that will house nvar*ncat*nreal parameter files
# for parallel execution
callpars = []
iseed = 0
seeds = gs.rseed_list(ncat*nvar*nreal, seed=353535)
for catcode, cat in catdict.items():
    # Parameters that change by category
    pars['catcode'] = catcode
    pars['datafl'] = ppmtdir+cat+'.out'
    for var, varg in zip(variables, vargs[cat]):
        # Parameters that change by variable
        pars['varcol'] = tdat.gscol(var)
        pars['vargstr'] = str(varg.model)
        for ireal in range(nreal):
            # Parameters that change by realization
            outfl = simdir+'real{}_{}_{}.gsb'.format(ireal+1, cat,
                                                     var)
            pars['outfl'] = outfl
            pars['catfl'] = catdir+'real{}.gsb'.format(ireal+1)
            pars['seed'] = str(seeds[iseed])
            iseed += 1
            # Add this parstr to the callpars
            callpars.append({'parstr': parstr.format(**pars)})
# Display the final parameter file, then run everything in parallel
print(callpars[-1]['parstr'])
gs.runparallel(usgsim, callpars, reportprogress=True)
The simulated variables must be merged for each realization prior to the PPMT back-transformation.
# Merge the per-variable simulated files into one file per realization
# and category, as required by the PPMT back-transformation.
for cat in gs.log_progress(cats):
    for ireal in range(nreal):
        # Report progress every 10th realization.
        # FIX: the original condition `(ireal+1 % 10) == 0` was never true
        # (% binds tighter than +), and the print attempted str+int
        # concatenation, which would raise a TypeError.
        if (ireal + 1) % 10 == 0:
            print('working on realization {}'.format(ireal + 1))
        for var in variables:
            simfl = simdir+'real{}_{}_{}.gsb'.format(ireal+1,
                                                     cat,
                                                     var)
            sim = gs.DataFile(simfl)
            if var == variables[0]:
                # The first variable initializes the merged file
                merge = sim
                merge.rename({merge.columns[0]: var})
            else:
                merge[var] = sim.data.values
            #gs.rmfile(simfl)
        merge.writefile(simdir+'real{}_{}.gsb'.format(ireal+1,
                                                      cat))
        # Free memory before the next realization
        del merge
        del sim
Apply the PPMT back-transformation, restoring the original units and multivariate dependence to each realization.
# Directory for the back-transformed realizations
backdir = outdir+'PPMT_B/'
gs.mkdir(backdir)
ppmt_b = gs.Program(program=exedir+'ppmt_b')
parstr = """ PPMT Back Transformation
************************
START OF PARAMETERS:
{tablefl} -file with transformation data
{simfl} -file with data to back-transform (GSB detection)
{nvar} {varcols} - nvar, column numbers
-9.0 1.0e21 - trimming limits
{nx} {ny} {nz} 1 - nx, ny, nz, nreal (ignored if nx = 0)
{backfl} -file for back transformed values
0 -enforce N(0,1)? (0=no,1=yes)
"""
# Parameters constant across categories and realizations
pars = dict(nvar=nvar,
            varcols=' '.join(str(i+1) for i in range(nvar)),
            nx=griddef.nx, ny=griddef.ny, nz=griddef.nz)
callpars = []
for cat in cats:
    # One transformation table per category
    pars['tablefl'] = tabledir+cat+'.trn'
    for ireal in range(nreal):
        simfl = simdir+'real{}_{}.gsb'.format(ireal+1, cat)
        pars['simfl'] = simfl
        backfl = backdir+'real{}_{}.gsb'.format(ireal+1, cat)
        pars['backfl'] = backfl
        callpars.append({'parstr': parstr.format(**pars)})
gs.runparallel(ppmt_b, callpars, reportprogress=True)
Now that the workflow is complete, merge simulated realizations for each category into a single file (per realization).
finaldir = 'PointScaleRealizations/'
gs.mkdir(finaldir)
# Output as a binary integer and floating point
fmt = [1]+[2]*nvar
# Though not required in the updated pygeostat, this version
# requires tvar (trimming variable) to be floating point
tvar = 'Phi'
for ireal in gs.log_progress(range(nreal)):
    # Categorical realization used to select which category's
    # back-transformed values populate each cell
    simcat = gs.DataFile(catdir+'real{}.gsb'.format(ireal+1))
    simvars = {}
    for catcode, cat in catdict.items():
        backfl = backdir+'real{}_{}.gsb'.format(ireal+1, cat)
        simvar = gs.DataFile(backfl)
        simvar.columns = variables
        simvars[catcode] = simvar
    # Merge the per-category models according to the category model
    sim = gs.mergemod(list(catdict.keys()), simcat, simvars)
    sim.writefile(finaldir+'real{}.gsb'.format(ireal+1), fmt=fmt, tvar=tvar)
# Output one realization as a VTK
# NOTE(review): the file is written twice -- once with the default
# vdtype and once as float32 -- presumably to demonstrate the file
# size difference; confirm the first write is intentional
sim.writefile('real{}.vtk'.format(ireal+1))
gs.gsParams['data.write_vtk.vdtype'] = 'float32'
sim.writefile('real{}.vtk'.format(ireal+1))
Four realizations are plotted for each variable in each orientation.
# Midpoint slice index along each axis, used for all section plots
sx, sy, sz = (int(n / 2) for n in (griddef.nx, griddef.ny, griddef.nz))
# Aspect setting and figure size for each section orientation
aspects = dict(xy=True, xz=False, yz=False)
figsizes = dict(xy=(10, 6), xz=(10, 4), yz=(10, 4))
Construct logical plotting color limits based on the distribution of the data
# Constant color limits per variable: 0 up to the 99th percentile of
# the declustered data (rounded to 1 decimal) so plots are comparable
vlims = {}
for var in variables:
    vlims[var] = (0, np.round(dat_all[var].quantile(0.99), decimals=1))
Store as a pickle for use in future notebooks. Any object may be stored, including dictionaries of multiple objects, with this exceptionally useful package. It is used only once in this course, for transparency of inputs and outputs.
# pickle can serialize (almost) any Python object for reuse later
import pickle
with open('vlims.pkl', 'wb') as f:
    pickle.dump(vlims, f)  # store the color-limit dictionary
# Plot sections of 4 realizations for each variable and orientation.
# The original outer loop was `for var, ax in zip(variables, axes):`,
# which iterated a stale `axes` binding from a previous cell (its `ax`
# was immediately shadowed, and zip could silently truncate
# `variables`); only `var` is needed, matching the pattern used for
# the Kh/Kv sections later in the workflow.
for var in variables:
    for orient, sliceno in zip(['xy', 'xz', 'yz'], [sz, sy, sx]):
        aspect, figsize = aspects[orient], figsizes[orient]
        fig, axes = gs.subplots(2, 2, cbar_mode='single',
                                figsize=figsize, aspect=aspect)
        for ireal, ax in zip(range(4), axes):
            simfl = finaldir+'real{}.gsb'.format(ireal+1)
            sim = gs.DataFile(simfl)
            gs.pixelplt(sim, var=var, pointdata=dat_all,
                        ax=ax, sliceno=sliceno, orient=orient,
                        title='Realization {}'.format(ireal+1),
                        vlim=vlims[var])
        fig.suptitle(var + ' ' + orient.upper()+' Section',
                     **{'weight': 'bold'})
Histogram, variogram and multivariate reproduction are now inspected.
Before inspecting histogram reproduction, the reference data distribution must be prepared since the previously calculated distributions (and weights) were by category.
parstr = """ Parameters for DECLUS
*********************
START OF PARAMETERS:
{datafl} -file with data
{xyzcols} {varcol} - columns for X, Y, Z, and variable
-98 1.0e21 - trimming limits
declus.sum -file for summary output
{declusfl} -file for output with data & weights
1.0 1.0 -Y and Z cell anisotropy (Ysize=size*Yanis)
0 -0=look for minimum declustered mean (1=max)
1 100 100 -number of cell sizes, min size, max size
10 -number of origin offsets
"""
pars = dict(datafl=dat_all.flname, xyzcols=dat_all.gscol(dat.xyz),
varcol=dat_all.gscol(variables[0]),
declusfl=declusdir+'all.out')
declus.run(parstr=parstr.format(**pars), liveoutput=True)
dat_all = gs.DataFile(declusdir+'all.out')
gs.rmfile('declus.sum')
# Histogram reproduction: realizations vs. declustered reference data
fig, axes = plt.subplots(2, 2, figsize=(10, 12))
axes = axes.flatten()  # 2x2 array -> flat sequence of 4 axes
for i, (var, ax) in enumerate(zip(variables, axes)):
    gs.histpltsim(finaldir+'real*.gsb', dat_all, refvar=var,
                  refwt=True, refclr='C3',
                  simvar=sim.gscol(var, string=False),
                  sim_fltype='gsb', xlabel=var,
                  xlim=(0.0, dat_all[var].max()),
                  ax=ax,)
# Calculate experimental variograms of the (pooled) data for
# checking variogram reproduction of the realizations
vargs_all = []
for i, var in enumerate(variables):  # NOTE(review): `i` is unused
    varg = gs.Variogram(dat_all, var=var, ndir=2, omnihz=True,
                        variostd=True, mute=True)
    varg.inferdirections(azm=0.0, dip=0.0, tilt=0.0)
    # Tuples give (horizontal, vertical) direction parameters
    varg.update_calcpars(nlags=(20, 10), lagdist=(10, 1.0),
                         lagtol=(5, .5),
                         azmtol=90.0, diptol=10.0,
                         bandhorz=(1.0e21, 10.0), bandvert=3.0 )
    varg.varcalc()
    vargs_all.append(varg)
start_time = time.time()
# Calculate the variograms of the realizations
for i, (var, varg) in enumerate(zip(variables, vargs_all)):
    varg.update_simpars(datafl=finaldir+'real*.gsb',
                        varcols=sim.gscol(var),
                        nlags=20)
    varg.varsim()  # experimental variograms of each realization
# Report elapsed wall-clock time in minutes
varsimtime = (time.time() - start_time)/60
print('execution took {} minutes'.format(varsimtime))
# One row per variable: horizontal and vertical variogram reproduction
fig, axes = plt.subplots(nvar, 2, figsize=(8, 3*nvar))
for i, (var, varg, ax) in enumerate(zip(variables,
                                        vargs_all,
                                        axes)):
    varg.plot(sim=True, titles=(var+' Horizontal',
                                var+' Vertical'),
              colors=['C3'],
              axes=ax)
    ax[0].set_xlabel('Lag (m)')
    ax[1].set_xlabel('Lag (m)')
fig.tight_layout()
The align_orient option of scatplts_lu allows for alignment of the upper and lower plot axes.
# Compare bivariate scatter of data vs. one realization
fig = gs.scatplts_lu(dat_all, sim, figsize=(10, 10),
                     nmax=5000, pad=(-5, -3), s=2,
                     titles=('Data', 'Realization'),
                     align_orient=True, stat_xy=(0.95, 0.95))
# Persist project settings for the next notebook
gs.gsParams.save('gsParams.txt')
gs.rmdir(outdir, verbose=True)
gs.rmfile('temp')
# Use a distinct name so the `time` module is not shadowed
# (the original rebound `time` itself, breaking any later call)
elapsed_min = (time.time() - start_workflow_time)/60
print('workflow execution took {} minutes'.format(elapsed_min))
The following notebook is comprised of 7 primary steps:
import pygeostat as gs
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
% matplotlib inline
Load the Matplotlib and Pygeostat project defaults.
# Restore the plotting style and the project defaults that were
# saved at the end of the multivariate notebook
gs.gsPlotStyle.load('../00-Introduction/gsParams.txt')
gs.gsParams.load('../04-Multivariate/gsParams.txt')
Assign nreal, catdict and griddef settings to variables for convenience.
nreal = gs.gsParams['data.nreal']  # number of realizations
print('nreal = {}'.format(nreal))
catdict = gs.gsParams['data.catdict']  # category code -> name mapping
print('catdict =', catdict)
griddef = gs.gsParams['data.griddef']  # point-scale grid definition
griddef
Path to the executables, and the directory that will host temporary output files.
exedir = '../exes/'  # CCG executables
# Point-scale realizations produced by the previous notebook
modeldir = '../04-Multivariate/PointScaleRealizations/'
outdir = 'Output/'  # temporary outputs of this notebook
gs.mkdir(outdir)
As is often done in practice, horizontal permeability is inferred based on simulated vertical permeability using Kv-Kh ratios that vary by lithofacies.
As an example, the Sand ratio below indicates that Kh is 5 times larger than its associated Kv.
# Kh/Kv ratio by lithofacies, e.g., a Sand cell's Kh is 5x its Kv
ratios = {'Sand':5, 'Breccia':3, 'SIHS':2, 'MIHS':1, 'Mud':1}
Output realizations with Kh to a new directory.
# Directory for the point-scale realizations augmented with Kh
pointdir = outdir+'PointScaleRealizations/'
gs.mkdir(pointdir)
Loop over each realization.
for real in gs.log_progress(range(nreal)):
    # Load the realization
    sim = gs.DataFile(modeldir+'real{}.gsb'.format(real+1))
    # Initialize the Kh column as null before adding as variable
    sim['Kh'] = np.nan
    sim.variables.append('Kh')
    # Identify records that match each category, before assigning
    # Kh to those records based on multiplying Kv with the
    # appropriate ratio
    for catcode, cat in catdict.items():
        # Boolean mask of the cells simulated as this category
        idx = sim[sim.cat] == catcode
        sim.data.loc[idx, 'Kh'] = sim.data.loc[idx, 'Kv']*ratios[cat]
    # fmt: integer category column + floating-point variables;
    # tvar='Kh' is the trimming variable for the GSB output
    sim.writefile(pointdir + 'real{}.gsb'.format(real+1),
                  fmt=[1]+[2]*sim.nvar, tvar='Kh')
Define the slice number and figure parameters, consistent with previous notebooks. Note that these parameters could also be stored and loaded with a pickle.
# Midpoint slice indices and section-plot settings (consistent with
# the previous notebook)
sx = int(griddef.nx / 2.0)
sz = int(griddef.nz / 2.0)
aspects = {'xy':True, 'yz':False}
figsizes = {'xy':(10, 6), 'yz':(10,4)}
Import the color limits that were defined in the last notebook, before adding a Kh entry.
# Load the color limits dumped with pickle in the previous notebook
import pickle
with open('../04-Multivariate/vlims.pkl', 'rb') as f:
    vlims = pickle.load(f)
# Kh limit scales the Kv limit by the maximum Kv-Kh ratio (Sand = 5)
vlims['Kh'] = (0, vlims['Kv'][1]*5)
Compare the Kh and Kv sections.
# Section plots comparing Kh and Kv: 4 realizations per orientation
for var in ['Kh', 'Kv']:
    for orient, sliceno in (('xy', sz), ('yz', sx)):
        aspect, figsize = aspects[orient], figsizes[orient]
        fig, axes = gs.subplots(2, 2, cbar_mode='single',
                                figsize=figsize, aspect=aspect)
        # First 4 realizations share a single color bar
        for ireal, ax in zip(range(4), axes):
            simfl = pointdir + 'real{}.gsb'.format(ireal + 1)
            sim = gs.DataFile(simfl)
            gs.pixelplt(sim, var=var, vlim=vlims[var], ax=ax,
                        sliceno=sliceno, orient=orient,
                        title='Realization {}'.format(ireal + 1))
        fig.suptitle(var + ' ' + orient.upper() + ' Section',
                     weight='bold')
The constructed realizations provide a realistic characterization of variability at the scale of the data. These realizations must be converted to a scale that is relevant for engineering decision making, which is accomplished by the averaging of points within blocks of that scale.
Utilize the GridDef.changeblocknum() function to create the block grid definition. Note that this is a relatively coarse grid definition, due to the underlying coarse point grid (unrealistic in practice, but necessary due to time constraints of this course).
# Number of points that will be averaged in each direction
scale_x, scale_y, scale_z = 5, 5, 2
# Initialize the block grid definition by reducing the number of
# nodes along each axis (block size grows accordingly)
griddef_block = griddef.changeblocknum(nnx=int(griddef.nx/scale_x),
                                       nny=int(griddef.ny/scale_y),
                                       nnz=int(griddef.nz/scale_z),
                                       return_copy=True)
griddef_block
Begin by creating a dictionary of the ublkavg (program used below) codes. In practice, flow simulation or proxy models would often be used for upscaling permeability. In this case, simply use (potentially oversimplistic) geometric and harmonic averaging.
# Mapping of averaging-method names to the ublkavg integer codes
avg_methods = {'Arithmetic': 0, 'Geometric': 1,
               'Harmonic': 2, 'Majority Rules': 3}
Specify the averaging method code for each variable.
# Averaging method code per variable (RHS lookups evaluate before
# `avg_methods` is rebound, so reusing the name is safe here)
avg_methods = {'Phi': avg_methods['Arithmetic'],
               'Vsh': avg_methods['Arithmetic'],
               'Sw': avg_methods['Arithmetic'],
               'Kv': avg_methods['Harmonic'],
               'Kh': avg_methods['Geometric'],
               'Categories': avg_methods['Majority Rules']}
avg_methods
Convert to a string, which follows the order that the variables follow in the point scale realizations file.
# Collapse the per-column method codes, ordered as the variables
# appear in the point-scale realizations file (sim.columns), into
# the space-separated string that ublkavg expects
avg_methods = ' '.join(str(avg_methods[col]) for col in sim.columns)
avg_methods
This section is required, because water saturation is weighted by the associated porosity in the averaging calculation. Observe that the output has a 2 for the Sw column, indicating that it is weighted by the 2nd column of the input file (Phi).
# Build the weight-column string: Sw is weighted by Phi (saturation
# averages are porosity-weighted); all other variables get 0 (none)
wtcols = ''
for col in [sim.cat] + sim.variables:
    if col == 'Sw':
        # NOTE(review): assumes gscol returns a str here -- confirm
        wtcols += sim.gscol('Phi')
    else:
        wtcols += '0'
    wtcols += ' '
print('weight columns =', wtcols)
# Upscale each point-scale realization to the block grid
blockdir = outdir+'BlockScaleRealizations/'
gs.mkdir(blockdir)
ublkavg = gs.Program(program=exedir+'ublkavg')
parstr = """ Parameters for UBLKAVG
**********************
START OF PARAMETERS
{datafl} - file with input grid (see Note1)
{nvar} - number of variables to average
{varcols} - columns for variables to average
{wtcols} - columns for weights (0 = none)
-998.0 1.0e21 - trimming limits
{gridstr}
{gridstr_block}
1 - number of realizations
0.05 - minimum required volume fraction (see Note2)
{avg_methods} - averaging method (see Note3)
{outfl} - file for output grid (see Note1)
0 - append volume fraction columns?
Note1:
Input/outfiles can be GSLIB (ASCII) or GSB (binary) formatted, although
their format must match. GSLIB is assumed unless a .gsb extension is detected.
Note2:
The program outputs a null value for the upsclaed block if the cumulative
volume of smaller blocks that are used in its average do not meet the
specified fraction of the its total volume. Reasons that the fraction would
be less than 1 include unaligned grids, and null values in the small grid.
Note3:
For each variable, supply one of the following averaging methods:
0 = arithmetic mean
1 = geometric mean
2 = harmonic mean
3 = majority rules (for categorical variables)
"""
# nvar + 1 accounts for the category column being averaged as well
pars = dict(nvar=sim.nvar+1,
            varcols=sim.gscol([sim.cat] + sim.variables),
            wtcols=wtcols, gridstr=griddef,
            gridstr_block=griddef_block,
            avg_methods=avg_methods)
callpars = []
for real in range(nreal):
    # Only the input/output files change per realization
    pars['datafl'] = pointdir+'real{}.gsb'.format(real+1)
    pars['outfl'] = blockdir+'real{}.gsb'.format(real+1)
    callpars.append({'parstr': parstr.format(**pars)})
gs.runparallel(ublkavg, callpars, reportprogress=True)
The block grid definition is assigned as the default going forward so that it is automatically used.
# The block grid is now the default for subsequent pygeostat calls
gs.gsParams['data.griddef'] = griddef_block
Check the summary statistics of the first realization. The sim variable is used going forward for parameters relating to the block scale realizations.
# `sim` now refers to a block-scale realization for later parameters
sim = gs.DataFile(blockdir+'real1.gsb')
sim.describe(sim.columns)
Reset the vlims to the upscaled extents.
# Recompute color limits from the upscaled (smoothed) distribution
for var in vlims:
    vlims[var] = (0, np.round(sim[var].quantile(0.99),
                              decimals=2))
In advance of plotting sections, adjust the slice number since the number of nodes in each direction is reduced.
# Midpoint slices of the coarser block grid
sx = int(griddef_block.nx/2)
sz = int(griddef_block.nz/2)
Plot the sections.
for var in gs.log_progress(sim.variables):
    for orient, sliceno in zip(['xy', 'yz'], [sz, sx]):
        aspect, figsize = aspects[orient], figsizes[orient]
        fig, axes = gs.subplots(2, 2, cbar_mode='single',
                                figsize=figsize, aspect=aspect)
        # Plot the first 4 realizations
        for ireal, ax in zip(range(4), axes):
            simfl = blockdir+'real{}.gsb'.format(ireal+1)
            # tsim avoids clobbering the block-scale `sim` reference
            tsim = gs.DataFile(simfl)
            gs.pixelplt(tsim, var=var, ax=ax, sliceno=sliceno,
                        orient=orient, vlim=vlims[var],
                        title='Realization {}'.format(ireal+1))
        fig.suptitle(var + ' ' + orient.upper() + ' Section',
                     **{'weight': 'bold'})
Porosity and water saturation are the variables (in addition to reservoir geometry) that impact the resource calculation that follows. The distributions of these variables are compared before and after upscaling for one realization.
# Define a function that is used for plotting each scale
def hists(simfl, title):
    """Plot Phi and Sw histograms for the realization file `simfl`.

    The file is loaded once (the original re-read it for every
    variable) and one histogram per variable is drawn side by side
    with constant x limits so the scales are comparable.
    """
    # Set constant xlimits for each variable
    xlims = {'Phi': (0, 0.55), 'Sw': (0, 1.1)}
    sim = gs.DataFile(simfl)  # load once, not once per variable
    # Plot histograms of both variables
    fig, axes = plt.subplots(1, 2, figsize=(10, 4))
    for var, ax in zip(['Phi', 'Sw'], axes):
        gs.histplt(sim, var=var, ax=ax, xlim=xlims[var])
    fig.suptitle(title, y=1.01, **{'weight': 'bold'})
    fig.tight_layout()
# Call the function with each scale
hists(pointdir+'real1.gsb', 'Point Scale')
hists(blockdir+'real1.gsb', 'Block Scale')
Summaries of the realizations are very useful in practice. The expected value of the realizations (e-type) provides an estimate model that is often necessary for some engineering design and operations with current industry technology (e.g., mine planning).
Summaries of uncertainty (e.g., standard deviation of the realizations, P10/P90, probability to be +/-15% of the e-type, etc.) are similarly useful for communicating uncertainty and potentially integrating the associated risk into engineering design. That said, these local summaries of uncertainty should not be confused with joint uncertainty, such as the resource calculation in the next section.
This is required pre-processing for execution of upostsim in the next section.
# Stack all block-scale realizations into a single GSB file, which
# upostsim requires as input
merge_reals_gsb = gs.Program(program=exedir+'merge_reals_gsb')
parstr = """ Parameters for MERGE_REALS_GSB #sticking all the realizations together and putting them into one file
*******************************
START OF MAIN:
{datafl} - (str) base file name including sub directory if needed
{nvar} - (int) number of variables
{varcols} - (int) column number for value(s)
1 - (int) starting realization
{nreal} - (int) Last realization to read
{outfl} - (str) output file name
1 - (int) output variable to trim
-8.0 1.0e21 - (float, float) tmin and tmax for trimming
"""
blockfl = blockdir+'allreals.gsb'
# nvar + 1 includes the category column
pars = dict(datafl=blockdir+'real', outfl=blockfl,
            nvar=sim.nvar+1,
            varcols=sim.gscol([sim.cat] + sim.variables),
            nreal=nreal)
merge_reals_gsb.run(parstr=parstr.format(**pars))
This program provides virtually any desired summary of local uncertainty. Note that the P10 and P90 statistics are ill defined based on only 20 realizations (more so than the e-type and standard deviation).
# Summarize local uncertainty (e-type, std. dev., percentiles, ...)
# for each variable across all realizations
upostsim = gs.Program(program=exedir+'upostsim')
parstr = """ Parameters for UPOSTSIM
***********************
START OF PARAMETERS:
{datafl} -input file name with the simulated variable
{varcol} - column for the variable
-998.0 1.0e21 - trimming limits
{nreal} -nreal (grid size taken from GSB header)
3 -number of output options
1 2 3 - output option codes (see below)
1 - if 1, variance (0) or standard deviation (1)
1 15 - if 2, num. of percentages and values (1 to 49)
2 10 90 - if 3, num. of percentiles and values (1 to 99)
2 1.0 1.5 - if 4, num. of cutoffs and values
3 0.25 0.5 0.75 - if 5, num. of tolerances and values
{outfl} -output file name
option 1 = E-type mean and conditional variance
2 = prob to be within X% of the mean
3 = percentiles
4 = prob to be above cutoff
5 = prob to be within absolute tolerance of the mean
6 = number of simulated values per grid node
NOTE: Must use GSB file formats for input and output for now.
"""
pars = dict(datafl=blockfl, nreal=nreal)
callpars = []
for var in sim.variables:
    pars['varcol'] = sim.gscol(var)
    pars['outfl'] = blockdir+'postsim_{}.gsb'.format(var)
    callpars.append({'parstr': parstr.format(**pars)})
gs.runparallel(upostsim, callpars, reportprogress=True)
# NOTE(review): `var` is the leftover loop variable, so only the
# last variable's summary is inspected here -- confirm intended
simstat = gs.DataFile(blockdir+'postsim_{}.gsb'.format(var))
simstat.describe()
# Smaller figures for the uncertainty-summary section plots
figsizes = {'xy':(10, 4), 'yz':(10, 2)}
for var in gs.log_progress(sim.variables):
    # Load the summaries for this variable. Use the lowercase
    # 'postsim_' prefix that the upostsim loop actually wrote --
    # the original capitalized 'Postsim_' only resolved on
    # case-insensitive file systems
    simstat = gs.DataFile(blockdir+'postsim_{}.gsb'.format(var))
    # Provide some better names for plotting
    simstat.rename({'Cond.Stdev.': 'Standard Deviation',
                    'Prob.ToBeWithin15%OfTheMean':
                    'Probability to be +/-15% of the E-type',
                    'ValueOfThe10thPercentile': 'P10',
                    'ValueOfThe90thPercentile': 'P90'})
    for orient, sliceno in zip(['xy', 'yz'], [sz, sx]):
        aspect, figsize = aspects[orient], figsizes[orient]
        # Standard deviation and probability-within-15% maps
        fig, axes = gs.subplots(1, 2, figsize=figsize, axes_pad=.8,
                                cbar_mode='each', aspect=aspect)
        for stat, ax in zip(['Standard Deviation',
                             'Probability to be +/-15% of the E-type'],
                            axes):
            # Color limits span the central 90% of each statistic
            vlim = (np.round(simstat[stat].quantile(.05), decimals=2),
                    np.round(simstat[stat].quantile(.95), decimals=2))
            gs.pixelplt(simstat, var=stat, vlim=vlim,
                        ax=ax, sliceno=sliceno,
                        orient=orient, title=stat)
        fig.suptitle(var+' '+orient.upper()+' Section', x=0.5, y=1.08,
                     **{'weight': 'bold'})
        # P10, E-type and P90 share a single color bar
        # NOTE(review): assumes the upostsim output column is named
        # 'E-type' -- confirm against the file header
        fig, axes = gs.subplots(1, 3, figsize=figsize,
                                cbar_mode='single', aspect=aspect)
        for stat, ax in zip(['P10', 'E-type', 'P90'], axes):
            gs.pixelplt(simstat, var=stat, vlim=vlims[var],
                        ax=ax, sliceno=sliceno,
                        orient=orient, title=stat)
The economic deposit is simplified here, to block columns that have a vertical average of $\phi > 0.5$ and $S_w < 0.5$ within the deposit.
This grid definition consists of a single vertical block.
# Map (2D) grid definition: same areal nodes, one vertical block
griddef_map = griddef_block.changeblocknum(nnx=griddef_block.nx,
                                           nny=griddef_block.ny,
                                           nnz=1,
                                           return_copy=True)
griddef_map
mapdir = outdir+'MapRealizations/'
gs.mkdir(mapdir)
Use the ublkavg program that was previously initialized above.
parstr = """ Parameters for UBLKAVG
**********************
START OF PARAMETERS
{datafl} - file with input grid (see Note1)
2 - number of variables to average
{varcols} - columns for variables to average
{wtcols} - columns for weights (0 = none)
-998.0 1.0e21 - trimming limits
{gridstr_block}
{gridstr_map}
1 - number of realizations
0.05 - minimum required volume fraction (see Note2)
0 0 - averaging method (see Note3)
{outfl} - file for output grid (see Note1)
0 - append volume fraction columns?
Note1:
Input/outfiles can be GSLIB (ASCII) or GSB (binary) formatted, although
their format must match. GSLIB is assumed unless a .gsb extension is detected.
Note2:
The program outputs a null value for the upsclaed block if the cumulative
volume of smaller blocks that are used in its average do not meet the
specified fraction of the its total volume. Reasons that the fraction would
be less than 1 include unaligned grids, and null values in the small grid.
Note3:
For each variable, supply one of the following averaging methods:
0 = arithmetic mean
1 = geometric mean
2 = harmonic mean
3 = majority rules (for categorical variables)
"""
pars = dict(varcols=sim.gscol(['Phi', 'Sw']),
wtcols='0 ' + sim.gscol('Phi'),
gridstr_block=griddef_block,
gridstr_map=griddef_map)
callpars = []
for real in range(nreal):
pars['datafl'] = blockdir+'real{}.gsb'.format(real+1)
pars['outfl'] = mapdir+'real{}.gsb'.format(real+1)
callpars.append(dict(parstr=parstr.format(**pars)))
gs.runparallel(ublkavg, callpars, reportprogress=True)
# The map grid is now the default grid definition
gs.gsParams['data.griddef'] = griddef_map
sim = gs.DataFile(mapdir + 'real1.gsb')
sim.describe()
When a block has $\phi > 0.15$ and $S_w < 0.5$ for a realization, 1 is added to the prob array below. The prob array is then divided by nreal providing an array that provides the probability to be within the deposit.
# Count, per map cell, how many realizations satisfy both deposit
# criteria, then normalize to a probability
prob = np.zeros(griddef_map.count())
for real in range(nreal):
    sim = gs.DataFile(mapdir + 'real{}.gsb'.format(real + 1))
    # Element-wise AND (&) of the two criteria
    idx = (sim['Phi'] > 0.15) & (sim['Sw'] < 0.5)
    # True/False accumulate as 1/0
    prob = prob + idx.astype(int)
prob = prob / nreal
prob = gs.DataFile(data=pd.DataFrame(prob,
                                     columns=['Deposit Probability']))
gs.pixelplt(prob)
Unlike local uncertainty summaries in the previous sections, calculating the global resource uncertainty requires consideration of the joint block values in each realization (joint uncertainty).
The original-oil-in-place ($OOIP$) is calculated for each realization as:
$OOIP = \Sigma_{i=1}^N(1 - S_w(\mathbf{u}_i)) \cdot \phi(\mathbf{u}_i) \cdot V$
where $S_w$ is water saturation, $\phi$ is porosity, and $V$ is the block volume.
# OOIP per realization: sum over blocks of
# (hydrocarbon fraction) * porosity * block volume
ooips = []
for real in range(nreal):
    sim = gs.DataFile(blockdir + 'real{}.gsb'.format(real + 1))
    oil_per_block = ((1 - sim['Sw']) * sim['Phi']
                     * griddef_block.blockvolume())
    ooips.append(oil_per_block.sum())
An xlabel must be provided since ooips is a list, not a DataFile or DataFrame with names attached.
# Resource uncertainty distribution across the realizations;
# icdf=True plots the cumulative form
gs.histplt(ooips, title='Resource Distribution', xlabel='OOIP (m^3)',
           icdf=True)
# Clean up the temporary workflow outputs
gs.rmdir(outdir)
gs.rmfile('temp')